From 6c5225ea8669f1a2b2fbc1fc29a47a7d61d20e27 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 12:55:09 +0300 Subject: [PATCH 001/167] [smgr_api] [community] smgr_api.patch Make smgr API pluggable. Add smgr_hook that can be used to define custom smgrs. Remove smgrsw[] array and smgr_sw selector. Instead, smgropen() loads f_smgr implementation using smgr_hook. Also add smgr_init_hook and smgr_shutdown_hook. And a lot of mechanical changes in smgr.c functions. This patch is proposed to community: https://commitfest.postgresql.org/33/3216/ Author: anastasia --- src/backend/storage/smgr/smgr.c | 159 +++++++++++++++----------------- src/include/storage/smgr.h | 56 ++++++++++- 2 files changed, 131 insertions(+), 84 deletions(-) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 4dc24649df9..b455d07edce 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -18,6 +18,7 @@ #include "postgres.h" #include "access/xlog.h" +#include "catalog/pg_tablespace.h" #include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/ipc.h" @@ -26,47 +27,8 @@ #include "utils/hsearch.h" #include "utils/inval.h" - -/* - * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. Note that smgr subfunctions are - * generally expected to report problems via elog(ERROR). An exception is - * that smgr_unlink should use elog(WARNING), rather than erroring out, - * because we normally unlink relations during post-commit/abort cleanup, - * and so it's too late to raise an error. Also, various conditions that - * would normally be errors should be allowed during bootstrap and/or WAL - * recovery --- see comments in md.c for details. 
- */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); - void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; - -static const f_smgr smgrsw[] = { +static const f_smgr smgr_md = { /* magnetic disk */ - { .smgr_init = mdinit, .smgr_shutdown = NULL, .smgr_open = mdopen, @@ -82,11 +44,8 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, - } }; -static const int NSmgr = lengthof(smgrsw); - /* * Each backend has a hashtable that stores all extant SMgrRelation objects. * In addition, "unowned" SMgrRelation objects are chained together in a list. 
@@ -96,7 +55,7 @@ static HTAB *SMgrRelationHash = NULL; static dlist_head unowned_relns; /* local function prototypes */ -static void smgrshutdown(int code, Datum arg); +//static void smgrshutdown(int code, Datum arg); /* @@ -110,33 +69,71 @@ static void smgrshutdown(int code, Datum arg); void smgrinit(void) { - int i; + if (smgr_init_hook) + (*smgr_init_hook)(); - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_init) - smgrsw[i].smgr_init(); - } + smgr_init_standard(); - /* register the shutdown proc */ - on_proc_exit(smgrshutdown, 0); + /* + * ZENITH XXX + * This doesn't work with inmem_smgr, so temporarily disable. + * Anyway, we don't have any real smgrshutdown function. + */ + // /* register the shutdown proc */ + // on_proc_exit(smgrshutdown, 0); } -/* - * on_proc_exit hook for smgr cleanup during backend shutdown - */ -static void -smgrshutdown(int code, Datum arg) +//ZENITH XXX See comment above. Silence compiler warning. +// /* +// * on_proc_exit hook for smgr cleanup during backend shutdown +// */ +// static void +// smgrshutdown(int code, Datum arg) +// { +// if (smgr_shutdown_hook) +// (*smgr_shutdown_hook)(); + +// smgr_shutdown_standard(); +// } + +/* Hook for plugins to get control in smgr */ +smgr_hook_type smgr_hook = NULL; +smgr_init_hook_type smgr_init_hook = NULL; +smgr_shutdown_hook_type smgr_shutdown_hook = NULL; + +const f_smgr * +smgr_standard(BackendId backend, RelFileNode rnode) { - int i; + return &smgr_md; +} - for (i = 0; i < NSmgr; i++) +void +smgr_init_standard(void) +{ + mdinit(); +} + +void +smgr_shutdown_standard(void) +{ +} + +const f_smgr * +smgr(BackendId backend, RelFileNode rnode) +{ + const f_smgr *result; + + if (smgr_hook) { - if (smgrsw[i].smgr_shutdown) - smgrsw[i].smgr_shutdown(); + result = (*smgr_hook)(backend, rnode); } + else + result = smgr_standard(backend, rnode); + + return result; } + /* * smgropen() -- Return an SMgrRelation object, creating it if need be. 
* @@ -176,10 +173,11 @@ smgropen(RelFileNode rnode, BackendId backend) reln->smgr_targblock = InvalidBlockNumber; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + reln->smgr = smgr(backend, rnode); /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + (*reln->smgr).smgr_open(reln); /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); @@ -246,7 +244,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return (*reln->smgr).smgr_exists(reln, forknum); } /* @@ -259,7 +257,7 @@ smgrclose(SMgrRelation reln) ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); owner = reln->smgr_owner; @@ -332,7 +330,7 @@ smgrclosenode(RelFileNodeBackend rnode) void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + (*reln->smgr).smgr_create(reln, forknum, isRedo); } /* @@ -360,12 +358,10 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if ((*rels[i]->smgr).smgr_exists(rels[i], forknum)) + (*rels[i]->smgr).smgr_immedsync(rels[i], forknum); } } } @@ -404,13 +400,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileNodeBackend rnode = rels[i]->smgr_rnode; - int which = rels[i]->smgr_which; rnodes[i] = rnode; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + 
(*rels[i]->smgr).smgr_close(rels[i], forknum); } /* @@ -439,10 +434,8 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo); + (*rels[i]->smgr).smgr_unlink(rnodes[i], forknum, isRedo); } pfree(rnodes); @@ -462,7 +455,7 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, + (*reln->smgr).smgr_extend(reln, forknum, blocknum, buffer, skipFsync); /* @@ -486,7 +479,7 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); + return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } /* @@ -501,7 +494,7 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + (*reln->smgr).smgr_read(reln, forknum, blocknum, buffer); } /* @@ -523,7 +516,7 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, + (*reln->smgr).smgr_write(reln, forknum, blocknum, buffer, skipFsync); } @@ -536,7 +529,7 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, + (*reln->smgr).smgr_writeback(reln, forknum, blocknum, nblocks); } @@ -554,7 +547,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = (*reln->smgr).smgr_nblocks(reln, forknum); 
reln->smgr_cached_nblocks[forknum] = result; @@ -620,7 +613,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + (*reln->smgr).smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -659,7 +652,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + (*reln->smgr).smgr_immedsync(reln, forknum); } /* diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a6fbf7b6a6c..a7c98c7e7fe 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,8 @@ #include "storage/block.h" #include "storage/relfilenode.h" +struct f_smgr; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -59,7 +61,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ + const struct f_smgr *smgr; /* * for md.c; per-fork arrays of the number of open segments @@ -77,6 +79,58 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) + +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). 
An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. + */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); + void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); +} f_smgr; + +typedef void (*smgr_init_hook_type) (void); +typedef void (*smgr_shutdown_hook_type) (void); +extern PGDLLIMPORT smgr_init_hook_type smgr_init_hook; +extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; +extern void smgr_init_standard(void); +extern void smgr_shutdown_standard(void); + + +typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode); +extern PGDLLIMPORT 
smgr_hook_type smgr_hook; +extern const f_smgr *smgr_standard(BackendId backend, RelFileNode rnode); + +extern const f_smgr *smgr(BackendId backend, RelFileNode rnode); + extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); From 4dc7b767290206d32881f343e6eedf62cdbc6b49 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 12:58:03 +0300 Subject: [PATCH 002/167] [contrib/zenith] contrib_zenith.patch Add contrib/zenith that handles interaction with remote pagestore. To use it add 'shared_preload_library = zenith' to postgresql.conf. It adds a protocol for network communications - see libpagestore.c; and implements smgr API. Also it adds several custom GUC variables: - zenith.page_server_connstring - zenith.callmemaybe_connstring - zenith.zenith_timeline - zenith.wal_redo Authors: Stas Kelvich Konstantin Knizhnik Heikki Linnakangas --- contrib/zenith/Makefile | 25 + contrib/zenith/inmem_smgr.c | 298 ++++++++++ contrib/zenith/libpagestore.c | 258 +++++++++ contrib/zenith/pagestore_client.h | 151 +++++ contrib/zenith/pagestore_smgr.c | 930 ++++++++++++++++++++++++++++++ contrib/zenith/zenith.control | 4 + 6 files changed, 1666 insertions(+) create mode 100644 contrib/zenith/Makefile create mode 100644 contrib/zenith/inmem_smgr.c create mode 100644 contrib/zenith/libpagestore.c create mode 100644 contrib/zenith/pagestore_client.h create mode 100644 contrib/zenith/pagestore_smgr.c create mode 100644 contrib/zenith/zenith.control diff --git a/contrib/zenith/Makefile b/contrib/zenith/Makefile new file mode 100644 index 00000000000..ad41c55bd71 --- /dev/null +++ b/contrib/zenith/Makefile @@ -0,0 +1,25 @@ +# contrib/zenith/Makefile + + +MODULE_big = zenith +OBJS = \ + $(WIN32RES) \ + inmem_smgr.o libpagestore.o pagestore_smgr.o + +PG_CPPFLAGS = -I$(libpq_srcdir) +SHLIB_LINK_INTERNAL = $(libpq) + +EXTENSION = zenith +PGFILEDESC = "zenith - cloud storage for 
PostgreSQL" + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +SHLIB_PREREQS = submake-libpq +subdir = contrib/zenith +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c new file mode 100644 index 00000000000..6ad1e65b04a --- /dev/null +++ b/contrib/zenith/inmem_smgr.c @@ -0,0 +1,298 @@ +/*------------------------------------------------------------------------- + * + * inmem_smgr.c + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * contrib/zenith/inmem_smgr.c + * + * TODO cleanup obsolete copy-pasted comments + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "pagestore_client.h" +#include "utils/hsearch.h" +#include "access/xlog.h" + +typedef struct +{ + RelFileNode node; + ForkNumber forknum; + BlockNumber blkno; +} WrNodeKey; + +typedef struct +{ + WrNodeKey tag; + char data[BLCKSZ]; +} WrNode; + +HTAB *inmem_files; + +/* + * inmem_init() -- Initialize private state + */ +void +inmem_init(void) +{ + HASHCTL hashCtl; + + hashCtl.keysize = sizeof(WrNodeKey); + hashCtl.entrysize = sizeof(WrNode); + + if (inmem_files) + hash_destroy(inmem_files); + + inmem_files = hash_create("wal-redo files map", + 1024, + &hashCtl, + HASH_ELEM | HASH_BLOBS); +} + +/* + * inmem_exists() -- Does the physical file exist? 
+ */ +bool +inmem_exists(SMgrRelation reln, ForkNumber forknum) +{ + WrNodeKey key; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = 0; + return hash_search(inmem_files, + &key, + HASH_FIND, + NULL) != NULL; +} + +/* + * inmem_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_unlink() -- Unlink a relation. + * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forknum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ +} + +/* + * inmem_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. 
+ */ +void +inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = blkno; + node = hash_search(inmem_files, + &key, + HASH_ENTER, + NULL); + memcpy(node->data, buffer, BLCKSZ); +} + +/* + * inmem_open() -- Initialize newly-opened relation. + */ +void +inmem_open(SMgrRelation reln) +{ +} + +/* + * inmem_close() -- Close the specified relation, if it isn't closed already. + */ +void +inmem_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +/* + * inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return true; +} + +/* + * inmem_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ +} + +/* + * inmem_read() -- Read the specified block from a relation. + */ +void +inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *buffer) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = blkno; + node = hash_search(inmem_files, + &key, + HASH_FIND, + NULL); + if (node != NULL) + memcpy(buffer, node->data, BLCKSZ); + else + memset(buffer, 0, BLCKSZ); +} + +/* + * inmem_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). 
+ */ +void +inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = blocknum; + node = hash_search(inmem_files, + &key, + HASH_ENTER, + NULL); + memcpy(node->data, buffer, BLCKSZ); +} + +/* + * inmem_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +inmem_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + WrNodeKey key; + WrNode *node; + + key.node = reln->smgr_rnode.node; + key.forknum = forknum; + key.blkno = 0; + + while (true) + { + node = hash_search(inmem_files, + &key, + HASH_FIND, + NULL); + if (node == NULL) + return key.blkno; + key.blkno += 1; + } +} + +/* + * inmem_truncate() -- Truncate relation to specified number of blocks. + */ +void +inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ +} + +/* + * inmem_immedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. 
+ */ +void +inmem_immedsync(SMgrRelation reln, ForkNumber forknum) +{ +} +static const struct f_smgr inmem_smgr = +{ + .smgr_init = inmem_init, + .smgr_shutdown = NULL, + .smgr_open = inmem_open, + .smgr_close = inmem_close, + .smgr_create = inmem_create, + .smgr_exists = inmem_exists, + .smgr_unlink = inmem_unlink, + .smgr_extend = inmem_extend, + .smgr_prefetch = inmem_prefetch, + .smgr_read = inmem_read, + .smgr_write = inmem_write, + .smgr_writeback = inmem_writeback, + .smgr_nblocks = inmem_nblocks, + .smgr_truncate = inmem_truncate, + .smgr_immedsync = inmem_immedsync, +}; + +const f_smgr * +smgr_inmem(BackendId backend, RelFileNode rnode) +{ + if (backend != InvalidBackendId && !InRecovery) + return smgr_standard(backend, rnode); + else + { + return &inmem_smgr; + } +} + +void +smgr_init_inmem() +{ + inmem_init(); +} diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c new file mode 100644 index 00000000000..062f0cbf2e0 --- /dev/null +++ b/contrib/zenith/libpagestore.c @@ -0,0 +1,258 @@ +/*------------------------------------------------------------------------- + * + * libpqpagestore.c + * Handles network communications with the remote pagestore. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/zenith/libpqpagestore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "fmgr.h" +#include "access/xlog.h" + +#include "libpq-fe.h" +#include "libpq/pqformat.h" +#include "libpq/libpq.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "utils/guc.h" + +#include "replication/walproposer.h" + +PG_MODULE_MAGIC; + +void _PG_init(void); + +#define PqPageStoreTrace DEBUG5 + +#define ZENITH_TAG "[ZENITH_SMGR] " +#define zenith_log(tag, fmt, ...) 
ereport(tag, \ + (errmsg(ZENITH_TAG fmt, ## __VA_ARGS__), \ + errhidestmt(true), errhidecontext(true))) + +bool connected = false; +PGconn *pageserver_conn; + +static ZenithResponse * zenith_call(ZenithRequest request); +page_server_api api = { + .request = zenith_call +}; + +static void +zenith_connect() +{ + char *query; + int ret; + + pageserver_conn = PQconnectdb(page_server_connstring); + + if (PQstatus(pageserver_conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg("[ZENITH_SMGR] could not establish connection"), + errdetail_internal("%s", msg))); + } + + /* Ask the Page Server to connect to us, and stream WAL from us. */ + if (callmemaybe_connstring && callmemaybe_connstring[0]) + { + PGresult *res; + + query = psprintf("callmemaybe %s %s", zenith_timeline, callmemaybe_connstring); + res = PQexec(pageserver_conn, query); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + zenith_log(ERROR, + "[ZENITH_SMGR] callmemaybe command failed"); + } + PQclear(res); + } + + query = psprintf("pagestream %s", zenith_timeline); + ret = PQsendQuery(pageserver_conn, query); + if (ret != 1) + zenith_log(ERROR, + "[ZENITH_SMGR] failed to start dispatcher_loop on pageserver"); + + while (PQisBusy(pageserver_conn)) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(pageserver_conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? 
*/ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(pageserver_conn)) + zenith_log(ERROR, "[ZENITH_SMGR] failed to get handshake from pageserver: %s", + PQerrorMessage(pageserver_conn)); + } + } + + zenith_log(LOG, "libpqpagestore: connected to '%s'", page_server_connstring); + + connected = true; +} + + +static ZenithResponse * +zenith_call(ZenithRequest request) +{ + StringInfoData req_buff; + StringInfoData resp_buff; + ZenithMessage *resp; + + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + + if (!connected) + zenith_connect(); + + req_buff = zm_pack((ZenithMessage *) & request); + + /* send request */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + zenith_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); + + { + char *msg = zm_to_string((ZenithMessage *) & request); + + zenith_log(PqPageStoreTrace, "Sent request: %s", msg); + pfree(msg); + } + + /* read response */ + resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); + resp_buff.cursor = 0; + + if (resp_buff.len == -1) + zenith_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + + resp = zm_unpack(&resp_buff); + PQfreemem(resp_buff.data); + + Assert(messageTag(resp) == T_ZenithStatusResponse + || messageTag(resp) == T_ZenithNblocksResponse + || messageTag(resp) == T_ZenithReadResponse); + + { + char *msg = zm_to_string((ZenithMessage *) & request); + + zenith_log(PqPageStoreTrace, "Got response: %s", msg); + pfree(msg); + } + + + /* + * XXX: zm_to_string leak strings. Check with what memory contex all this + * methods are called. 
+ */ + + return (ZenithResponse *) resp; +} + + +static bool +check_zenith_timeline(char **newval, void **extra, GucSource source) +{ + uint8 ztimelineid[16]; + + return **newval == '\0' || HexDecodeString(ztimelineid, *newval, 16); +} + +/* + * Module initialization function + */ +void +_PG_init(void) +{ + DefineCustomStringVariable("zenith.page_server_connstring", + "connection string to the page server", + NULL, + &page_server_connstring, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("zenith.callmemaybe_connstring", + "Connection string that Page Server or WAL safekeeper should use to connect to us", + NULL, + &callmemaybe_connstring, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + DefineCustomStringVariable("zenith.zenith_timeline", + "Zenith timelineid the server is running on", + NULL, + &zenith_timeline, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_timeline, NULL, NULL); + + DefineCustomBoolVariable("zenith.wal_redo", + "start in wal-redo mode", + NULL, + &wal_redo, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + if (page_server != NULL) + zenith_log(ERROR, "libpqpagestore already loaded"); + + zenith_log(PqPageStoreTrace, "libpqpagestore already loaded"); + page_server = &api; + + /* Is there more correct way to pass CustomGUC to postgres code? 
*/ + zenith_timeline_walproposer = zenith_timeline; + + if (wal_redo) + { + zenith_log(PqPageStoreTrace, "set inmem_smgr hook"); + smgr_hook = smgr_inmem; + smgr_init_hook = smgr_init_inmem; + } + else if (page_server_connstring && page_server_connstring[0]) + { + zenith_log(PqPageStoreTrace, "set zenith_smgr hook"); + smgr_hook = smgr_zenith; + smgr_init_hook = smgr_init_zenith; + } +} diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h new file mode 100644 index 00000000000..400fb259a6b --- /dev/null +++ b/contrib/zenith/pagestore_client.h @@ -0,0 +1,151 @@ +/*------------------------------------------------------------------------- + * + * pagestore_client.h + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * contrib/zenith/pagestore_client.h + * + *------------------------------------------------------------------------- + */ +#ifndef pageserver_h +#define pageserver_h + +#include "postgres.h" + +#include "access/xlogdefs.h" +#include "storage/relfilenode.h" +#include "storage/block.h" +#include "storage/smgr.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/memutils.h" + +#include "pg_config.h" + +typedef enum +{ + /* pagestore_client -> pagestore */ + T_ZenithExistsRequest = 0, + T_ZenithNblocksRequest, + T_ZenithReadRequest, + + /* pagestore -> pagestore_client */ + T_ZenithStatusResponse = 100, + T_ZenithNblocksResponse, + T_ZenithReadResponse, +} ZenithMessageTag; + + +/* base struct for c-style inheritance */ +typedef struct +{ + ZenithMessageTag tag; +} ZenithMessage; + +#define messageTag(m) (((const ZenithMessage *)(m))->tag) + +extern char const *const ZenithMessageStr[]; + +typedef struct +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; +} PageKey; + +typedef struct +{ + ZenithMessageTag tag; + uint64 system_id; + PageKey page_key; + XLogRecPtr lsn; /* request page 
version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithMessageTag tag; + bool ok; + uint32 n_blocks; + char page[1]; +} ZenithResponse; + +StringInfoData zm_pack(ZenithMessage * msg); +ZenithMessage *zm_unpack(StringInfo s); +char *zm_to_string(ZenithMessage * msg); + +/* + * API + */ + +typedef struct +{ + ZenithResponse *(*request) (ZenithRequest request); +} page_server_api; + +extern page_server_api * page_server; + +extern char *page_server_connstring; +extern char *callmemaybe_connstring; +extern char *zenith_timeline; +extern bool wal_redo; + +extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); +extern void smgr_init_zenith(void); + +extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode); +extern void smgr_init_inmem(void); +extern void smgr_shutdown_inmem(void); + +/* zenith storage manager functionality */ + +extern void zenith_init(void); +extern void zenith_open(SMgrRelation reln); +extern void zenith_close(SMgrRelation reln, ForkNumber forknum); +extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum); +extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void zenith_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void zenith_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void 
zenith_immedsync(SMgrRelation reln, ForkNumber forknum); + +extern bool zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum); +extern void zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum); + +/* zenith wal-redo storage manager functionality */ + +extern void inmem_init(void); +extern void inmem_open(SMgrRelation reln); +extern void inmem_close(SMgrRelation reln, ForkNumber forknum); +extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum); +extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void inmem_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void inmem_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + +#endif diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c new file mode 100644 index 00000000000..3a91d80b926 --- /dev/null +++ b/contrib/zenith/pagestore_smgr.c @@ -0,0 +1,930 @@ +/*------------------------------------------------------------------------- + * + * pagestore_smgr.c + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/zenith/pagestore_smgr.c + * + 
*------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "pagestore_client.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "access/xlogdefs.h" +#include "storage/bufmgr.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "replication/walsender.h" +#include "catalog/pg_tablespace_d.h" + +/* + * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API + * calls to md.c, and *also* do the calls to the Page Server. On every + * read, compare the versions we read from local disk and Page Server, + * and Assert that they are identical. + */ +/* #define DEBUG_COMPARE_LOCAL */ + +#ifdef DEBUG_COMPARE_LOCAL +#include "access/nbtree.h" +#include "storage/bufpage.h" +#include "storage/md.h" +#include "access/xlog_internal.h" + +static char *hexdump_page(char *page); +#endif + +const int SmgrTrace = DEBUG5; + +bool loaded = false; + +page_server_api *page_server; + +/* GUCs */ +char *page_server_connstring; +char *callmemaybe_connstring; +char *zenith_timeline; +bool wal_redo = false; + +char const *const ZenithMessageStr[] = +{ + "ZenithExistsRequest", + "ZenithNblocksRequest", + "ZenithReadRequest", + "ZenithStatusResponse", + "ZenithReadResponse", + "ZenithNblocksResponse", +}; + +StringInfoData +zm_pack(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + pq_sendbyte(&s, msg->tag); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + { + ZenithRequest *msg_req = (ZenithRequest *) msg; + + pq_sendint32(&s, msg_req->page_key.rnode.spcNode); + pq_sendint32(&s, msg_req->page_key.rnode.dbNode); + pq_sendint32(&s, msg_req->page_key.rnode.relNode); + pq_sendbyte(&s, msg_req->page_key.forknum); + pq_sendint32(&s, msg_req->page_key.blkno); + pq_sendint64(&s, msg_req->lsn); + + break; + } + + /* pagestore 
-> pagestore_client */ + case T_ZenithStatusResponse: + case T_ZenithNblocksResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + pq_sendbyte(&s, msg_resp->ok); + pq_sendint32(&s, msg_resp->n_blocks); + break; + } + case T_ZenithReadResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + pq_sendbyte(&s, msg_resp->ok); + pq_sendint32(&s, msg_resp->n_blocks); + pq_sendbytes(&s, msg_resp->page, BLCKSZ); // XXX: should be varlena + break; + } + } + return s; +} + +ZenithMessage * +zm_unpack(StringInfo s) +{ + ZenithMessageTag tag = pq_getmsgbyte(s); + ZenithMessage *msg = NULL; + + switch (tag) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + { + ZenithRequest *msg_req = palloc0(sizeof(ZenithRequest)); + + msg_req->tag = tag; + msg_req->system_id = 42; + msg_req->page_key.rnode.spcNode = pq_getmsgint(s, 4); + msg_req->page_key.rnode.dbNode = pq_getmsgint(s, 4); + msg_req->page_key.rnode.relNode = pq_getmsgint(s, 4); + msg_req->page_key.forknum = pq_getmsgbyte(s); + msg_req->page_key.blkno = pq_getmsgint(s, 4); + msg_req->lsn = pq_getmsgint64(s); + pq_getmsgend(s); + + msg = (ZenithMessage *) msg_req; + break; + } + + /* pagestore -> pagestore_client */ + case T_ZenithStatusResponse: + case T_ZenithNblocksResponse: + { + ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse)); + + msg_resp->tag = tag; + msg_resp->ok = pq_getmsgbyte(s); + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + msg = (ZenithMessage *) msg_resp; + break; + } + + case T_ZenithReadResponse: + { + ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse) + BLCKSZ); + + msg_resp->tag = tag; + msg_resp->ok = pq_getmsgbyte(s); + msg_resp->n_blocks = pq_getmsgint(s, 4); + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); // XXX: should be varlena + pq_getmsgend(s); + + msg = (ZenithMessage *) msg_resp; + break; + } + } + + return msg; +} + +/* dump to json for 
debugging / error reporting purposes */ +char * +zm_to_string(ZenithMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + appendStringInfoString(&s, "{"); + appendStringInfo(&s, "\"type\": \"%s\"", ZenithMessageStr[msg->tag]); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + { + ZenithRequest *msg_req = (ZenithRequest *) msg; + + appendStringInfo(&s, ", \"page_key\": \"%d.%d.%d.%d.%u\", \"lsn\": \"%X/%X\"}", + msg_req->page_key.rnode.spcNode, + msg_req->page_key.rnode.dbNode, + msg_req->page_key.rnode.relNode, + msg_req->page_key.forknum, + msg_req->page_key.blkno, + (uint32) (msg_req->lsn >> 32), (uint32) (msg_req->lsn)); + + break; + } + + /* pagestore -> pagestore_client */ + case T_ZenithStatusResponse: + case T_ZenithNblocksResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + + appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u}", + msg_resp->ok, + msg_resp->n_blocks + ); + + break; + } + case T_ZenithReadResponse: + { + ZenithResponse *msg_resp = (ZenithResponse *) msg; + + appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u, \"page\": \"XXX\"}", + msg_resp->ok, + msg_resp->n_blocks + ); + break; + } + } + return s.data; +} + + +static void +zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + XLogRecPtr lsn = PageGetLSN(buffer); + + /* + * If the page was not WAL-logged before eviction then we can lose its modification. + * PD_WAL_LOGGED bit is used to mark pages which are wal-logged. + * + * See also comments to PD_WAL_LOGGED. + * + * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the whole index . + * That's duplicative with the WAL-logging that we do here. + * See log_newpage_range() calls. + * + * FIXME: Redoing this record will set the LSN on the page. That could + * mess up the LSN-NSN interlock in GiST index build. 
+ */ + if (forknum == FSM_FORKNUM && !RecoveryInProgress()) + { + /* FSM is never WAL-logged and we don't care. */ + XLogRecPtr recptr; + recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + elog(SmgrTrace, "FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } + else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) + { + /* + * Always WAL-log vm. + * We should never miss clearing visibility map bits. + * + * TODO Is it too bad for performance? + * Hopefully we do not evict actively used vm too often. + */ + XLogRecPtr recptr; + recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + XLogFlush(recptr); + lsn = recptr; + + elog(SmgrTrace, "Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } + else if (!(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED) + && !RecoveryInProgress()) + { + XLogRecPtr recptr; + /* + * We assume standard page layout here. + * + * But at smgr level we don't really know what kind of a page this is. + * We have filtered visibility map pages and fsm pages above. + * TODO Do we have any special page types? + */ + + recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); + + /* If we wal-log hint bits, someone could concurrently update page + * and reset PD_WAL_LOGGED again, so this assert is not relevant anymore. + * + * See comment to FlushBuffer(). + * The caller must hold a pin on the buffer and have share-locked the + * buffer contents. 
(Note: a share-lock does not prevent updates of + * hint bits in the buffer, so the page could change while the write + * is in progress, but we assume that that will not invalidate the data + * written.) + */ + Assert(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by log_newpage */ + + /* + * Need to flush it too, so that it gets sent to the Page Server before we + * might need to read it back. It should get flushed eventually anyway, at + * least if there is some other WAL activity, so this isn't strictly + * necessary for correctness. But if there is no other WAL activity, the + * page read might get stuck waiting for the record to be streamed out + * for an indefinite time. + * + * FIXME: Flushing the WAL is expensive. We should track the last "evicted" + * LSN instead, and update it here. Or just kick the bgwriter to do the + * flush, there is no need for us to block here waiting for it to finish. + */ + XLogFlush(recptr); + lsn = recptr; + elog(SmgrTrace, "Force wal logging of page %u of relation %u/%u/%u.%u, lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } else { + elog(SmgrTrace, "Page %u of relation %u/%u/%u.%u is alread wal logged at lsn=%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, (uint32)lsn); + } + SetLastWrittenPageLSN(lsn); +} + + + +/* + * zenith_init() -- Initialize private state + */ +void +zenith_init(void) +{ + /* noop */ +#ifdef DEBUG_COMPARE_LOCAL + mdinit(); +#endif +} + + +/* + * Return LSN for requesting pages and number of blocks from page server + */ +static XLogRecPtr +zenith_get_request_lsn(bool nonrel) +{ + XLogRecPtr lsn; + XLogRecPtr flushlsn; + + if (RecoveryInProgress()) + { + lsn = GetXLogReplayRecPtr(NULL); + elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + 
lsn = InvalidXLogRecPtr; + } + else if (am_walsender) + { + lsn = InvalidXLogRecPtr; + elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); + } + else if (nonrel) + { + lsn = GetFlushRecPtr(); + elog(DEBUG1, "zenith_get_request_lsn norel GetFlushRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + else + { + lsn = GetLastWrittenPageLSN(); + flushlsn = GetFlushRecPtr(); + + /* + * Use the latest LSN that was evicted from the buffer cache. Any + * pages modified by later WAL records must still in the buffer cache, + * so our request cannot concern those. + */ + lsn = GetLastWrittenPageLSN(); + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + if (lsn == InvalidXLogRecPtr) + { + /* + * We haven't evicted anything yet since the server was + * started. Then just use the latest flushed LSN. That's always + * safe, using the latest evicted LSN is really just an + * optimization. + */ + lsn = flushlsn; + elog(DEBUG1, "zenith_get_request_lsn GetFlushRecPtr lsn %X/%X", + (uint32) ((lsn) >> 32), (uint32) (lsn)); + } + + /* + * Is it possible that the last-written LSN is ahead of last flush LSN? Probably not, + * we shouldn't evict a page from the buffer cache before all its modifications have + * been safely flushed. That's the "WAL before data" rule. But better safe than sorry. + */ + if (lsn > flushlsn) + { + elog(LOG, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (flushlsn >> 32), (uint32) flushlsn); + XLogFlush(lsn); + } + } + return lsn; +} + + +/* + * zenith_exists() -- Does the physical file exist? 
+ */ +bool +zenith_exists(SMgrRelation reln, ForkNumber forkNum) +{ + bool ok; + ZenithResponse *resp; + + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithExistsRequest, + .page_key = { + .rnode = reln->smgr_rnode.node, + .forknum = forkNum + }, + .lsn = zenith_get_request_lsn(false) + }); + ok = resp->ok; + pfree(resp); + return ok; +} + +/* + * zenith_create() -- Create a new relation on zenithd storage + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +{ + elog(SmgrTrace, "Create relation %u/%u/%u.%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum); + +#ifdef DEBUG_COMPARE_LOCAL + mdcreate(reln, forkNum, isRedo); +#endif +} + +/* + * zenith_unlink() -- Unlink a relation. + * + * Note that we're passed a RelFileNodeBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +{ +#ifdef DEBUG_COMPARE_LOCAL + mdunlink(rnode, forkNum, isRedo); +#endif +} + +/* + * zenith_extend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). 
Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + zenith_wallog_page(reln, forkNum, blkno, buffer); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, blkno, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + mdextend(reln, forkNum, blkno, buffer, skipFsync); +#endif +} + +/* + * zenith_open() -- Initialize newly-opened relation. + */ +void +zenith_open(SMgrRelation reln) +{ + /* no work */ + elog(SmgrTrace, "[ZENITH_SMGR] open noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdopen(reln); +#endif +} + +/* + * zenith_close() -- Close the specified relation, if it isn't closed already. + */ +void +zenith_close(SMgrRelation reln, ForkNumber forknum) +{ + /* no work */ + elog(SmgrTrace, "[ZENITH_SMGR] close noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdclose(reln, forknum); +#endif +} + +/* + * zenith_prefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); + return true; +} + +/* + * zenith_writeback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. 
+ */ +void +zenith_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + /* not implemented */ + elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdwriteback(reln, forknum, blocknum, nblocks); +#endif +} + +/* + * zenith_read() -- Read the specified block from a relation. + */ +void +zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + ZenithResponse *resp; + XLogRecPtr request_lsn; + + request_lsn = zenith_get_request_lsn(false); + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithReadRequest, + .page_key = { + .rnode = reln->smgr_rnode.node, + .forknum = forkNum, + .blkno = blkno + }, + .lsn = request_lsn + }); + + if (!resp->ok) + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn))); + + memcpy(buffer, resp->page, BLCKSZ); + ((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; /* Clear PD_WAL_LOGGED bit stored in WAL record */ + pfree(resp); + + +#ifdef DEBUG_COMPARE_LOCAL + if (forkNum == MAIN_FORKNUM) + { + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; + + mdread(reln, forkNum, blkno, mdbuf); + + memcpy(pageserver_masked, buffer, BLCKSZ); + memcpy(mdbuf_masked, mdbuf, BLCKSZ); + + if (PageIsNew(mdbuf)) { + if (!PageIsNew(pageserver_masked)) { + elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(buffer)); + } + } + else if (PageIsNew(buffer)) { + elog(PANIC, "page is new in Page Server but not in MD at 
blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf)); + } + else if (PageGetSpecialSize(mdbuf) == 0) + { + // assume heap + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + { + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + { + // assume btree + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); + + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + hexdump_page(mdbuf_masked), + hexdump_page(pageserver_masked)); + } + } + } + } +#endif +} + +#ifdef DEBUG_COMPARE_LOCAL +static char * +hexdump_page(char *page) +{ + StringInfoData result; + + initStringInfo(&result); + + for (int i = 0; i < BLCKSZ; i++) + { + if (i % 8 == 0) + appendStringInfo(&result, " "); + if (i % 40 == 0) + appendStringInfo(&result, "\n"); + appendStringInfo(&result, "%02x", 
(unsigned char)(page[i])); + } + + return result.data; +} +#endif + + +bool +zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum) +{ + bool ok; + ZenithResponse *resp; + + elog(SmgrTrace, "[ZENITH_SMGR] zenith_nonrel_page_exists relnode %u/%u/%u_%d blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno); + + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithExistsRequest, + .page_key = { + .rnode = rnode, + .forknum = forknum, + .blkno = blkno + }, + .lsn = zenith_get_request_lsn(true) + }); + ok = resp->ok; + pfree(resp); + return ok; +} + +void +zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum) +{ + int bufsize = BLCKSZ; + ZenithResponse *resp; + XLogRecPtr lsn; + + //43 is magic for RELMAPPER_FILENAME in page cache + // relmapper files has non-standard size of 512bytes + if (forknum == 43) + bufsize = 512; + + lsn = zenith_get_request_lsn(true); + + elog(SmgrTrace, "[ZENITH_SMGR] read nonrel relnode %u/%u/%u_%d blkno %u lsn %X/%X", + rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno, + (uint32) ((lsn) >> 32), (uint32) (lsn)); + + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithReadRequest, + .page_key = { + .rnode = rnode, + .forknum = forknum, + .blkno = blkno + }, + .lsn = lsn + }); + + if (!resp->ok) + elog(ERROR, "[ZENITH_SMGR] smgr page not found"); + + memcpy(buffer, resp->page, bufsize); + pfree(resp); +} + + +/* + * zenith_write() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). 
+ */ +void +zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) +{ + XLogRecPtr lsn; + + zenith_wallog_page(reln, forknum, blocknum, buffer); + + lsn = PageGetLSN(buffer); + elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, blocknum, + (uint32) (lsn >> 32), (uint32) lsn); + +#ifdef DEBUG_COMPARE_LOCAL + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif +} + +/* + * zenith_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +zenith_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + ZenithResponse *resp; + int n_blocks; + XLogRecPtr request_lsn; + + request_lsn = zenith_get_request_lsn(false); + resp = page_server->request((ZenithRequest) { + .tag = T_ZenithNblocksRequest, + .page_key = { + .rnode = reln->smgr_rnode.node, + .forknum = forknum, + }, + .lsn = request_lsn + }); + n_blocks = resp->n_blocks; + + elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + n_blocks); + + pfree(resp); + return n_blocks; +} + +/* + * zenith_truncate() -- Truncate relation to specified number of blocks. + */ +void +zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + XLogRecPtr lsn; + + /* + * Truncating a relation drops all its buffers from the buffer cache without + * calling smgrwrite() on them. But we must account for that in our tracking + * of last-written-LSN all the same: any future smgrnblocks() request must + * return the new size after the truncation. We don't know what the LSN of + * the truncation record was, so be conservative and use the most recently + * inserted WAL record's LSN. 
+ */ + lsn = GetXLogInsertRecPtr(); + + /* + * Flush it, too. We don't actually care about it here, but let's uphold + * the invariant that last-written LSN <= flush LSN. + */ + XLogFlush(lsn); + + SetLastWrittenPageLSN(lsn); + +#ifdef DEBUG_COMPARE_LOCAL + mdtruncate(reln, forknum, nblocks); +#endif +} + +/* + * zenith_immedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. + */ +void +zenith_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); + +#ifdef DEBUG_COMPARE_LOCAL + mdimmedsync(reln, forknum); +#endif +} + +static const struct f_smgr zenith_smgr = +{ + .smgr_init = zenith_init, + .smgr_shutdown = NULL, + .smgr_open = zenith_open, + .smgr_close = zenith_close, + .smgr_create = zenith_create, + .smgr_exists = zenith_exists, + .smgr_unlink = zenith_unlink, + .smgr_extend = zenith_extend, + .smgr_prefetch = zenith_prefetch, + .smgr_read = zenith_read, + .smgr_write = zenith_write, + .smgr_writeback = zenith_writeback, + .smgr_nblocks = zenith_nblocks, + .smgr_truncate = zenith_truncate, + .smgr_immedsync = zenith_immedsync, +}; + + +const f_smgr * +smgr_zenith(BackendId backend, RelFileNode rnode) +{ + + /* Don't use page server for temp relations */ + if (backend != InvalidBackendId) + return smgr_standard(backend, rnode); + else + return &zenith_smgr; +} + +void +smgr_init_zenith(void) +{ + zenith_init(); +} diff --git a/contrib/zenith/zenith.control b/contrib/zenith/zenith.control 
new file mode 100644 index 00000000000..9aa5e2f067a --- /dev/null +++ b/contrib/zenith/zenith.control @@ -0,0 +1,4 @@ +# zenith extension +comment = 'cloud storage for PostgreSQL' +default_version = '1.0' +module_pathname = '$libdir/zenith' From 7448697eeaa171613761fecb6959bffb13ec5aa2 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:13:11 +0300 Subject: [PATCH 003/167] [walredo] zenith_wal_redo.patch Add WAL redo helper for zenith - alternative postgres operation mode to replay wal by pageserver request. To start postgres in wal-redo mode, run postgres with --wal-redo option It requires zenith shared library and zenith.wal_redo Author: Heikki Linnakangas --- src/backend/access/transam/xlog.c | 14 +- src/backend/access/transam/xlogutils.c | 17 + src/backend/main/main.c | 4 + src/backend/tcop/Makefile | 2 + src/backend/tcop/zenith_wal_redo.c | 647 +++++++++++++++++++++++++ src/include/access/xlogutils.h | 2 + src/include/tcop/tcopprot.h | 4 + 7 files changed, 688 insertions(+), 2 deletions(-) create mode 100644 src/backend/tcop/zenith_wal_redo.c diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6208e123e5d..34d83f3e702 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10474,10 +10474,20 @@ xlog_redo(XLogReaderState *record) for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++) { Buffer buffer; + XLogRedoAction result; - if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED) + result = XLogReadBufferForRedo(record, block_id, &buffer); + if (result == BLK_DONE && !IsUnderPostmaster) + { + /* + * In the special WAL process, blocks that are being ignored + * return BLK_DONE. Accept that. 
+ */ + } + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - UnlockReleaseBuffer(buffer); + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); } } else if (info == XLOG_BACKUP_END) diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index d17d660f460..baf4dbed4aa 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -31,6 +31,8 @@ #include "utils/rel.h" +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* GUC variable */ bool ignore_invalid_pages = false; @@ -345,6 +347,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, elog(PANIC, "failed to locate backup block with ID %d", block_id); } + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + *buf = ReadBufferWithoutRelcache(rnode, forknum, + blkno, mode, NULL); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. 
diff --git a/src/backend/main/main.c b/src/backend/main/main.c index e58e24a6465..51c9dfedabc 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -205,6 +205,10 @@ main(int argc, char *argv[]) PostgresMain(argc, argv, NULL, /* no dbname */ strdup(get_user_name_or_exit(progname))); /* does not return */ + else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) + WalRedoMain(argc, argv, + NULL, /* no dbname */ + strdup(get_user_name_or_exit(progname))); /* does not return */ else PostmasterMain(argc, argv); /* does not return */ abort(); /* should not get here */ diff --git a/src/backend/tcop/Makefile b/src/backend/tcop/Makefile index f662a7dd1cf..84f027436a4 100644 --- a/src/backend/tcop/Makefile +++ b/src/backend/tcop/Makefile @@ -20,4 +20,6 @@ OBJS = \ pquery.o \ utility.o +OBJS += zenith_wal_redo.o + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c new file mode 100644 index 00000000000..4503648fc3e --- /dev/null +++ b/src/backend/tcop/zenith_wal_redo.c @@ -0,0 +1,647 @@ +/*------------------------------------------------------------------------- + * + * zenith_wal_redo.c + * Entry point for WAL redo helper + * + * + * This file contains an alternative main() function for the 'postgres' + * binary. In the special mode, we go into a special mode that's similar + * to the single user mode. We don't launch postmaster or any auxiliary + * processes. Instead, we wait for command from 'stdin', and respond to + * 'stdout'. + * + * There's a TAP test for this in contrib/zenith_store/t/002_wal_redo_helper.pl + * + * The protocol through stdin/stdout is loosely based on the libpq protocol. 
 + * The process accepts messages through stdin, and each message has the format: + * + * char msgtype; + * int32 length; // length of message including 'length' but excluding + * // 'msgtype', in network byte order + * + * + * There are four message types: + * + * BeginRedoForBlock ('B'): Prepare for WAL replay for given block + * PushPage ('P'): Copy a page image (in the payload) to buffer cache + * ApplyRecord ('A'): Apply a WAL record (in the payload) + * GetPage ('G'): Return a page image from buffer cache. + * + * Currently, you only get a response to GetPage requests; the response is + * simply an 8k page, without any headers. Errors are logged to stderr. + * + * FIXME: + * - this currently requires a valid PGDATA, and creates a lock file there + * like a normal postmaster. There's no fundamental reason for that, though. + * - should have EndRedoForBlock, and flush page cache, to allow using this + * mechanism for more than one block without restarting the process. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/tcop/zenith_wal_redo.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <unistd.h> +#include <sys/socket.h> +#ifdef HAVE_SYS_SELECT_H +#include <sys/select.h> +#endif +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/time.h> +#include <sys/resource.h> +#endif + +#ifndef HAVE_GETRUSAGE +#include "rusagestub.h" +#endif + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "storage/ipc.h" +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" + +static int 
ReadRedoCommand(StringInfo inBuf); +static void BeginRedoForBlock(StringInfo input_message); +static void PushPage(StringInfo input_message); +static void ApplyRecord(StringInfo input_message); +static bool redo_block_filter(XLogReaderState *record, uint8 block_id); +static void GetPage(StringInfo input_message); + +static BufferTag target_redo_tag; + +#define TRACE DEBUG5 + +/* ---------------------------------------------------------------- + * FIXME comment + * PostgresMain + * postgres main loop -- all backends, interactive or otherwise start here + * + * argc/argv are the command line arguments to be used. (When being forked + * by the postmaster, these are not the original argv array of the process.) + * dbname is the name of the database to connect to, or NULL if the database + * name should be extracted from the command line arguments or defaulted. + * username is the PostgreSQL user name to be used for the session. + * ---------------------------------------------------------------- + */ +void +WalRedoMain(int argc, char *argv[], + const char *dbname, + const char *username) +{ + int firstchar; + StringInfoData input_message; + + /* Initialize startup process environment if necessary. */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* + * Parse command-line options. + * TODO + */ + //process_postgres_switches(argc, argv, PGC_POSTMASTER, &dbname); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + proc_exit(1); + + /* + * Set up signal handlers. (InitPostmasterChild or InitStandaloneProcess + * has already set up BlockSig and made that the active signal mask.) + * + * Note that postmaster blocked all signals before forking child process, + * so there is no race condition whereby we might receive a signal before + * we have set up the handler. 
+ * + * Also note: it's best not to use any signals that are SIG_IGNored in the + * postmaster. If such a signal arrives before we are able to change the + * handler to non-SIG_IGN, it'll get dropped. Instead, make a dummy + * handler in the postmaster to reserve the signal. (Of course, this isn't + * an issue for signals that are locally generated, such as SIGALRM and + * SIGPIPE.) + */ +#if 0 + if (am_walsender) + WalSndSignals(); + else + { + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, StatementCancelHandler); /* cancel current query */ + pqsignal(SIGTERM, die); /* cancel current query and exit */ + + /* + * In a postmaster child backend, replace SignalHandlerForCrashExit + * with quickdie, so we can tell the client we're dying. + * + * In a standalone backend, SIGQUIT can be generated from the keyboard + * easily, while SIGTERM cannot, so we make both signals do die() + * rather than quickdie(). + */ + if (IsUnderPostmaster) + pqsignal(SIGQUIT, quickdie); /* hard crash time */ + else + pqsignal(SIGQUIT, die); /* cancel current query and exit */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + + /* + * Ignore failure to write to frontend. Note: if frontend closes + * connection, we will notice it and exit cleanly when control next + * returns to outer loop. This seems safer than forcing exit in the + * midst of output during who-knows-what operation... + */ + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGFPE, FloatExceptionHandler); + + /* + * Reset some signals that are accepted by postmaster but not by + * backend + */ + pqsignal(SIGCHLD, SIG_DFL); /* system() requires this on some + * platforms */ + } +#endif + + /* + * Validate we have been given a reasonable-looking DataDir and change into it. + */ + checkDataDir(); + ChangeToDataDir(); + + /* + * Create lockfile for data directory. 
+ */ + CreateDataDirLockFile(false); + + /* read control file (error checking and contains config ) */ + LocalProcessControlFile(false); + + process_shared_preload_libraries(); + + /* Initialize MaxBackends (if under postmaster, was done already) */ + InitializeMaxBackends(); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory. We must do + * this before we can use LWLocks. + */ + InitAuxiliaryProcess(); + + SetProcessingMode(NormalProcessing); + + /* Redo routines won't work if we're not "in recovery" */ + InRecovery = true; + + /* + * Create the memory context we will use in the main loop. + * + * MessageContext is reset once per iteration of the main loop, ie, upon + * completion of processing of each command message from the client. + */ + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_SIZES); + + /* we need a ResourceOwner to hold buffer pins */ + Assert(CurrentResourceOwner == NULL); + CurrentResourceOwner = ResourceOwnerCreate(NULL, "wal redo"); + + /* Initialize resource managers */ + for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_startup != NULL) + RmgrTable[rmid].rm_startup(); + } + + /* + * Main processing loop + */ + for (;;) + { + /* + * Release storage left over from prior query cycle, and create a new + * query input buffer in the cleared MessageContext. 
+ */ + MemoryContextSwitchTo(MessageContext); + MemoryContextResetAndDeleteChildren(MessageContext); + + initStringInfo(&input_message); + + set_ps_display("idle"); + + /* + * (3) read a command (loop blocks here) + */ + firstchar = ReadRedoCommand(&input_message); + + switch (firstchar) + { + case 'B': /* BeginRedoForBlock */ + BeginRedoForBlock(&input_message); + break; + + case 'P': /* PushPage */ + PushPage(&input_message); + break; + + case 'A': /* ApplyRecord */ + ApplyRecord(&input_message); + break; + + case 'G': /* GetPage */ + GetPage(&input_message); + break; + + /* + * EOF means we're done. Perform normal shutdown. + */ + case EOF: + + /* + * NOTE: if you are tempted to add more code here, DON'T! + * Whatever you had in mind to do should be set up as an + * on_proc_exit or on_shmem_exit callback, instead. Otherwise + * it will fail to be called during other backend-shutdown + * scenarios. + */ + proc_exit(0); + + default: + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid frontend message type %d", + firstchar))); + } + } /* end of input-reading loop */ +} + +/* + * Some debug function that may be handy for now. 
+ */ +pg_attribute_unused() +static char * +pprint_buffer(char *data, int len) +{ + StringInfoData s; + initStringInfo(&s); + appendStringInfo(&s, "\n"); + for (int i = 0; i < len; i++) { + + appendStringInfo(&s, "%02x ", (*(((char *) data) + i) & 0xff) ); + if (i % 32 == 31) { + appendStringInfo(&s, "\n"); + } + } + appendStringInfo(&s, "\n"); + + return s.data; +} + +static char * +pprint_tag(BufferTag *tag) +{ + StringInfoData s; + + initStringInfo(&s); + + appendStringInfo(&s, "%u/%u/%u.%d blk %u", + tag->rnode.spcNode, + tag->rnode.dbNode, + tag->rnode.relNode, + tag->forkNum, + tag->blockNum + ); + + return s.data; +} +/* ---------------------------------------------------------------- + * routines to obtain user input + * ---------------------------------------------------------------- + */ + +/* + * Read next command from the client. + * + * the string entered by the user is placed in its parameter inBuf, + * and we act like a Q message was received. + * + * EOF is returned if end-of-file input is seen; time to shut down. + * ---------------- + */ + +/* + * Wait until there is data in stdin. Prints a log message every 10 s while + * waiting. + */ +static void +wait_with_timeout(void) +{ + for (;;) + { + struct timeval timeout = {10, 0}; + fd_set fds; + int ret; + + FD_ZERO(&fds); + FD_SET(STDIN_FILENO, &fds); + + ret = select(1, &fds, NULL, NULL, &timeout); + if (ret != 0) + break; + elog(DEBUG1, "still alive"); + } +} + +static int +ReadRedoCommand(StringInfo inBuf) +{ + char c; + int qtype; + int32 len; + int nread; + + /* FIXME: Use unbuffered I/O here, because the WAL redo process was getting + * stuck with buffered I/O. I'm not sure why, or whether the bug was somewhere + * in here or in the calling page server side.
+ */ + wait_with_timeout(); + if (read(STDIN_FILENO, &c, 1) == 0) + return EOF; + qtype = c; + + /* + * Like in the FE/BE protocol, all messages have a length word next + * after the type code; we can read the message contents independently of + * the type. + */ + if (read(STDIN_FILENO, &len, 4) != 4) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("could not read message length"))); + } + + len = pg_ntoh32(len); + + if (len < 4) + { + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid message length"))); + return EOF; + } + + len -= 4; /* discount length itself */ + + enlargeStringInfo(inBuf, len); + nread = 0; + while (nread < len) { + int n = read(STDIN_FILENO, inBuf->data + nread, len - nread); + if (n == -1) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("read error: %m"))); + if (n == 0) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); + nread += n; + } + inBuf->len = len; + inBuf->data[len] = '\0'; + + return qtype; +} + + +/* + * Prepare for WAL replay on given block + */ +static void +BeginRedoForBlock(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + MemoryContext oldcxt; + SMgrRelation reln; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + oldcxt = MemoryContextSwitchTo(TopMemoryContext); + INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); + + { + char* buf = pprint_tag(&target_redo_tag); + elog(TRACE, "BeginRedoForBlock %s", buf); + pfree(buf); + } + + MemoryContextSwitchTo(oldcxt); + + reln = smgropen(rnode, InvalidBackendId); + if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || + 
reln->smgr_cached_nblocks[forknum] < blknum + 1) + { + reln->smgr_cached_nblocks[forknum] = blknum + 1; + } +} + +/* + * Receive a page given by the client, and put it into buffer cache. + */ +static void +PushPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + const char *content; + Buffer buf; + Page page; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + * 8k page content + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + content = pq_getmsgbytes(input_message, BLCKSZ); + + buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_ZERO_AND_LOCK, NULL); + page = BufferGetPage(buf); + memcpy(page, content, BLCKSZ); + MarkBufferDirty(buf); /* pro forma */ + UnlockReleaseBuffer(buf); +} + +/* + * Receive a WAL record, and apply it. + * + * All the pages should be loaded into the buffer cache by PushPage calls already. 
+ */ +static void +ApplyRecord(StringInfo input_message) +{ + /* recovery here */ + char *errormsg; + XLogRecPtr lsn; + XLogRecord *record; + int nleft; + XLogReaderState reader_state; + + /* + * message format: + * + * LSN (the *end* of the record) + * record + */ + lsn = pq_getmsgint64(input_message); + + /* note: the input must be aligned here */ + record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord)); + + nleft = input_message->len - input_message->cursor; + if (record->xl_tot_len != sizeof(XLogRecord) + nleft) + elog(ERROR, "mismatch between record (%d) and message size (%d)", + record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); + + /* FIXME: use XLogReaderAllocate() */ + memset(&reader_state, 0, sizeof(XLogReaderState)); + reader_state.ReadRecPtr = 0; /* no 'prev' record */ + reader_state.EndRecPtr = lsn; /* this record */ + reader_state.decoded_record = record; + reader_state.errormsg_buf = palloc(1000 + 1); /* MAX_ERRORMSG_LEN */ + + if (!DecodeXLogRecord(&reader_state, record, &errormsg)) + elog(ERROR, "failed to decode WAL record: %s", errormsg); + + /* Ignore any other blocks than the ones the caller is interested in */ + redo_read_buffer_filter = redo_block_filter; + + RmgrTable[record->xl_rmid].rm_redo(&reader_state); + + redo_read_buffer_filter = NULL; + + elog(TRACE, "applied WAL record with LSN %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); +} + +static bool +redo_block_filter(XLogReaderState *record, uint8 block_id) +{ + BufferTag target_tag; + + if (!XLogRecGetBlockTag(record, block_id, + &target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum)) + { + /* Caller specified a bogus block_id */ + elog(PANIC, "failed to locate backup block with ID %d", block_id); + } + + /* + * If this block isn't one we are currently restoring, then return 'true' + * so that this gets ignored + */ + return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag); +} + +/* + * Get a page image back from buffer cache. 
+ * + * After applying some records. + */ +static void +GetPage(StringInfo input_message) +{ + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; + Buffer buf; + Page page; + + /* + * message format: + * + * spcNode + * dbNode + * relNode + * ForkNumber + * BlockNumber + */ + forknum = pq_getmsgbyte(input_message); + rnode.spcNode = pq_getmsgint(input_message, 4); + rnode.dbNode = pq_getmsgint(input_message, 4); + rnode.relNode = pq_getmsgint(input_message, 4); + blknum = pq_getmsgint(input_message, 4); + + /* FIXME: check that we got a BeginRedoForBlock message or this earlier */ + + buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_NORMAL, NULL); + page = BufferGetPage(buf); + /* single thread, so don't bother locking the page */ + + /* Response: Page content */ + fwrite(page, 1, BLCKSZ, stdout); /* FIXME: check errors */ + fflush(stdout); + + ReleaseBuffer(buf); + DropDatabaseBuffers(rnode.dbNode); + smgrinit(); //reset inmem smgr state + + elog(TRACE, "Page sent back for block %u", blknum); +} diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 9ac602b674d..7cebdf3af6d 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -33,6 +33,8 @@ typedef enum * need to be replayed) */ } XLogRedoAction; +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 buffer_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id); diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index 968345404e5..9da6e8768ab 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -88,4 +88,8 @@ extern bool set_plan_disabling_options(const char *arg, GucContext context, GucSource source); extern const char *get_stats_option_name(const char *arg); +extern void WalRedoMain(int argc, char *argv[], + const char *dbname, + const char 
*username); + #endif /* TCOPPROT_H */ From 4783dfc899624c52c4dcf476b3ccad4401cd5b71 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:20:37 +0300 Subject: [PATCH 004/167] lastWrittenPageLSN.patch Save lastWrittenPageLSN in XLogCtlData to know what pages to request from remote pageserver. Authors: Konstantin Knizhnik Heikki Linnakangas --- src/backend/access/transam/xlog.c | 30 ++++++++++++++++++++++++++++++ src/backend/commands/dbcommands.c | 7 ++++--- src/include/access/xlog.h | 3 +++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 34d83f3e702..83b6e8c7084 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -740,6 +740,7 @@ typedef struct XLogCtlData * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. */ XLogRecPtr lastFpwDisableRecPtr; + XLogRecPtr lastWrittenPageLSN; slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -8638,6 +8639,35 @@ GetInsertRecPtr(void) return recptr; } +/* + * GetLastWrittenPageLSN -- Returns maximal LSN of written page + */ +XLogRecPtr +GetLastWrittenPageLSN(void) +{ + XLogRecPtr lsn; + SpinLockAcquire(&XLogCtl->info_lck); + lsn = XLogCtl->lastWrittenPageLSN; + SpinLockRelease(&XLogCtl->info_lck); + + return lsn; +} + +/* + * SetLastWrittenPageLSN -- Set maximal LSN of written page + */ +void +SetLastWrittenPageLSN(XLogRecPtr lsn) +{ + SpinLockAcquire(&XLogCtl->info_lck); + if (lsn > XLogCtl->lastWrittenPageLSN) + XLogCtl->lastWrittenPageLSN = lsn; + SpinLockRelease(&XLogCtl->info_lck); +} + + + + /* * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL * position known to be fsync'd to disk. 
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 2b159b60ebb..405643ee4c3 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -663,7 +663,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) /* Record the filesystem change in XLOG */ { xl_dbase_create_rec xlrec; - + XLogRecPtr lsn; xlrec.db_id = dboid; xlrec.tablespace_id = dsttablespace; xlrec.src_db_id = src_dboid; @@ -672,8 +672,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_rec)); - (void) XLogInsert(RM_DBASE_ID, - XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_DBASE_ID, + XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); + SetLastWrittenPageLSN(lsn); } } table_endscan(scan); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index ee3e369b79f..8b8b14d2fd0 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -350,6 +350,9 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); +extern void SetLastWrittenPageLSN(XLogRecPtr lsn); +extern XLogRecPtr GetLastWrittenPageLSN(void); + extern bool PromoteIsTriggered(void); extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); From 0fd9474dc78dd0e3b9d8d5d36e0db410843ef626 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 21 May 2021 23:57:08 +0300 Subject: [PATCH 005/167] Fix GetPage requests right after replaying CREATE DATABASE In the test_createdb test, we created a new database, and created a new branch after that. 
I was seeing the test fail with: PANIC: could not open critical system index 2662 The WAL contained records like this: rmgr: XLOG len (rec/tot): 49/ 8241, tx: 0, lsn: 0/0163E8F0, prev 0/0163C8A0, desc: FPI , blkref #0: rel 1663/12985/1249 fork fsm blk 1 FPW rmgr: XLOG len (rec/tot): 49/ 8241, tx: 0, lsn: 0/01640940, prev 0/0163E8F0, desc: FPI , blkref #0: rel 1663/12985/1249 fork fsm blk 2 FPW rmgr: Standby len (rec/tot): 54/ 54, tx: 0, lsn: 0/01642990, prev 0/01640940, desc: RUNNING_XACTS nextXid 541 latestCompletedXid 539 oldestRunningXid 540; 1 xacts: 540 rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/016429C8, prev 0/01642990, desc: CHECKPOINT_ONLINE redo 0/163C8A0; tli 1; prev tli 1; fpw true; xid 0:541; oid 24576; multi 1; offset 0; oldest xid 532 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 540; online rmgr: Database len (rec/tot): 42/ 42, tx: 540, lsn: 0/01642A40, prev 0/016429C8, desc: CREATE copy dir 1663/1 to 1663/16390 rmgr: Standby len (rec/tot): 54/ 54, tx: 0, lsn: 0/01642A70, prev 0/01642A40, desc: RUNNING_XACTS nextXid 541 latestCompletedXid 539 oldestRunningXid 540; 1 xacts: 540 rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/01642AA8, prev 0/01642A70, desc: CHECKPOINT_ONLINE redo 0/1642A70; tli 1; prev tli 1; fpw true; xid 0:541; oid 24576; multi 1; offset 0; oldest xid 532 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 540; online rmgr: Transaction len (rec/tot): 66/ 66, tx: 540, lsn: 0/01642B20, prev 0/01642AA8, desc: COMMIT 2021-05-21 15:55:46.363728 EEST; inval msgs: catcache 21; sync rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/01642B68, prev 0/01642B20, desc: CHECKPOINT_SHUTDOWN redo 0/1642B68; tli 1; prev tli 1; fpw true; xid 0:541; oid 24576; multi 1; offset 0; oldest xid 532 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 0; shutdown The compute node had correctly replayed all the WAL up to 
the last record, and opened up. But when you tried to connect to the new database, the very first requests for the critical relations, like pg_class, were made with request LSN 0/01642990. That's the last record that's applicable to a particular block. Because the database CREATE record didn't bump up the "last written LSN", the getpage requests were made with too old LSN. I fixed this by adding a SetLastWrittenLSN() call to the redo of database CREATE record. It probably wouldn't hurt to also throw in a call at the end of WAL replay, but let's see if we bump into more cases like this first. This doesn't seem to be happening with page server as of 'main'; I was testing with a version where I had temporarily reverted all the recent changes to reconstruct control file, checkpoints, relmapper files etc. from the WAL records in the page server, so that the compute node was redoing all the WAL. I'm pretty sure we need this fix even with 'main', even though this test case wasn't failing there right now. --- src/backend/commands/dbcommands.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 405643ee4c3..509e482c355 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -2217,6 +2217,16 @@ dbase_redo(XLogReaderState *record) * We don't need to copy subdirectories */ copydir(src_path, dst_path, false); + + /* + * Make sure any future requests to the page server see the new + * database. + */ + { + XLogRecPtr lsn = record->EndRecPtr; + + SetLastWrittenPageLSN(lsn); + } } else if (info == XLOG_DBASE_DROP) { From c178d58a048c7c77eaaccb72b51dcf776db48fec Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:26:04 +0300 Subject: [PATCH 006/167] handle_eviction_of_non_wal_logged_pages.patch Some operations in PostgreSQL are not WAL-logged at all (i.e. hint bits) or delay wal-logging till the end of operation (i.e. index build). 
So if such page is evicted, we will lose the update. To fix it, we introduce PD_WAL_LOGGED bit to track whether the page was wal-logged. If the page is evicted before it has been wal-logged, then zenith smgr creates FPI for it. Authors: Konstantin Knizhnik anastasia --- src/backend/access/common/bufmask.c | 2 ++ src/backend/access/gist/gistutil.c | 2 ++ src/backend/access/transam/xloginsert.c | 15 ++++++++++++++- src/backend/storage/buffer/bufmgr.c | 14 ++++++++++++++ src/backend/storage/page/bufpage.c | 2 +- src/include/storage/bufpage.h | 19 ++++++++++++++++++- 6 files changed, 51 insertions(+), 3 deletions(-) diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index 003a0befb25..e042cfdef92 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -54,6 +54,8 @@ mask_page_hint_bits(Page page) PageClearFull(page); PageClearHasFreeLinePointers(page); + phdr->pd_flags &= ~PD_WAL_LOGGED; + /* * During replay, if the page LSN has advanced past our XLOG record's LSN, * we don't mark the page all-visible. 
See heap_xlog_visible() for diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 43ba03b6eb9..1a1bb4a53f6 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -866,6 +866,8 @@ gistNewBuffer(Relation r) if (XLogStandbyInfoActive() && RelationNeedsWAL(r)) gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page)); + ((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; + return buffer; } diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index b153fad594d..4622325901b 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -239,6 +239,7 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; + ((PageHeader)regbuf->page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -294,6 +295,7 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; + ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -1181,7 +1183,18 @@ log_newpage_range(Relation rel, ForkNumber forkNum, MarkBufferDirty(bufpack[i]); } - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + /* + * Zenith forces WAL logging of evicted pages, + * so it can happen that in some cases when pages are first + * modified and then WAL logged (for example building GiST/GiN + * indexes) there are no more pages which need to be WAL logged at + * the end of build procedure. As far as XLogInsert throws error + * if not records were inserted, we need to reset the insert state. 
+ */ + if (nbufs > 0) + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); + else + XLogResetInsertion(); for (i = 0; i < nbufs; i++) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index ffc6056c60c..f0518f9ecc4 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1615,6 +1615,11 @@ MarkBufferDirty(Buffer buffer) if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } + /* + * Clear PD_WAL_LOGGED flag so that if dirty page is evicted from page pool + * before been WAL logged, FPI WAL record will be enforced. + */ + ((PageHeader)BufferGetPage(buffer))->pd_flags &= ~PD_WAL_LOGGED; } /* @@ -2004,6 +2009,15 @@ BufferSync(int flags) item->blockNum = bufHdr->tag.blockNum; } + /* Zenith XXX + * Consider marking this page as not WAL-logged, + * so that pagestore_smgr issued a log record before eviction + * and persisted hint changes. + * TODO: check performance impacts of this approach + * since extra wal-logging may worsen the performance. + */ + //((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; + UnlockBufHdr(bufHdr, buf_state); /* Check for barrier events in case NBuffers is large. 
*/ diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 82ca91f5977..48dc7bde265 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -427,7 +427,7 @@ PageRestoreTempPage(Page tempPage, Page oldPage) pageSize = PageGetPageSize(tempPage); memcpy((char *) oldPage, (char *) tempPage, pageSize); - + ((PageHeader)oldPage)->pd_flags &= ~PD_WAL_LOGGED; pfree(tempPage); } diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index c86ccdaf608..6704f69f328 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -182,7 +182,24 @@ typedef PageHeaderData *PageHeader; #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ -#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ +/* Zenith XXX: + * Some operations in PostgreSQL are not WAL-logged at all (i.e. hint bits) + * or delay wal-logging till the end of operation (i.e. index build). + * + * So if such page is evicted, we will lose the update. + * To fix it, we introduce PD_WAL_LOGGED bit to track whether the page was wal-logged. + * If page is evicted before it has been wal-logged, then pagestore_smgr creates FPI for it. + * + * List of such operations: + * - GIN/GiST/SP-GiST index build + * - page and heaptuple hint bits + * - Clearing visibility map bits + * - FSM changes + * - ??? + */ +#define PD_WAL_LOGGED 0x0008 /* Page is wal-logged */ +#define PD_VALID_FLAG_BITS 0x000F /* OR of all valid pd_flags bits */ + /* * Page layout version number 0 is for pre-7.3 Postgres releases. 
From 3ee8149713759cc8b03f825d1645b7ecdd42ae2d Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:33:46 +0300 Subject: [PATCH 007/167] [walproposer] wal_proposer.patch Add WalProposer background worker to broadcast WAL stream to Zenith WAL acceptors Author: Konstantin Knizhnik --- src/backend/postmaster/bgworker.c | 4 + src/backend/postmaster/postmaster.c | 6 + src/backend/replication/Makefile | 4 +- src/backend/replication/walproposer.c | 873 ++++++++++++++++++++ src/backend/replication/walproposer_utils.c | 237 ++++++ src/backend/replication/walsender.c | 103 ++- src/backend/utils/misc/guc.c | 25 + src/include/replication/walproposer.h | 174 ++++ 8 files changed, 1399 insertions(+), 27 deletions(-) create mode 100644 src/backend/replication/walproposer.c create mode 100644 src/backend/replication/walproposer_utils.c create mode 100644 src/include/replication/walproposer.h diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index c40410d73ea..2be49df0eb0 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -22,6 +22,7 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" +#include "replication/walproposer.h" #include "storage/dsm.h" #include "storage/ipc.h" #include "storage/latch.h" @@ -128,6 +129,9 @@ static const struct }, { "ApplyWorkerMain", ApplyWorkerMain + }, + { + "WalProposerMain", WalProposerMain } }; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 581d552acf7..6ce609384e7 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -117,6 +117,7 @@ #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" @@ -997,6 +998,11 @@ PostmasterMain(int argc, char *argv[]) 
*/ ApplyLauncherRegister(); + /* + * Start WAL proposer bgworker if wal acceptors list is not empty + */ + WalProposerRegister(); + /* * process any libraries that should be preloaded at postmaster start */ diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile index a0381e52f31..23731a07576 100644 --- a/src/backend/replication/Makefile +++ b/src/backend/replication/Makefile @@ -24,7 +24,9 @@ OBJS = \ syncrep_gram.o \ walreceiver.o \ walreceiverfuncs.o \ - walsender.o + walsender.o \ + walproposer.o \ + walproposer_utils.o SUBDIRS = logical diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c new file mode 100644 index 00000000000..564defc024a --- /dev/null +++ b/src/backend/replication/walproposer.c @@ -0,0 +1,873 @@ +/*------------------------------------------------------------------------- + * + * walproposer.c + * + * Broadcast WAL stream to Zenith WAL acceptors + */ +#include +#include +#include "replication/walproposer.h" +#include "storage/latch.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "access/xlog.h" +#include "replication/walreceiver.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "storage/pmsignal.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +char* wal_acceptors_list; +int wal_acceptor_reconnect_timeout; +bool am_wal_proposer; + +static int n_walkeepers = 0; +static int quorum = 0; +static WalKeeper walkeeper[MAX_WALKEEPERS]; +static WalMessage* msgQueueHead; +static WalMessage* msgQueueTail; +static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ +static XLogRecPtr lastSentVCLLsn; /* VCL replies have been sent to walkeeper up to here */ +static ServerInfo serverInfo; +static WaitEventSet* waitEvents; +static WalKeeperResponse lastFeedback; +static XLogRecPtr restartLsn; /* Last position received by all walkeepers.
*/ +static RequestVote prop; /* Vote request for walkeeper */ +static int leader; /* Most advanced walkeeper */ +static int n_votes = 0; +static int n_connected = 0; +static TimestampTz last_reconnect_attempt; + +/* + * Combine hot standby feedbacks from all walkeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback* hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ + + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.hs.ts != 0) + { + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) + { + hs->xmin = walkeeper[i].feedback.hs.xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + } + } +} + +static void +ResetWalProposerEventSet(void) +{ + if (waitEvents) + FreeWaitEventSet(waitEvents); + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].sock != PGINVALID_SOCKET) + { + int events; + switch (walkeeper[i].state) + { + case SS_SEND_WAL: + events = WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE; + break; + case SS_CONNECTING: + events = WL_SOCKET_WRITEABLE; + break; + default: + events = WL_SOCKET_READABLE; + break; + } + walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, events, walkeeper[i].sock, NULL, &walkeeper[i]); + } + } +} + +/* + * This function is called to establish new connection or to reestablish connection in case + * of connection failure. 
+ * Close current connection if any and try to initiate new one + */ +static void +ResetConnection(int i) +{ + bool established; + + if (walkeeper[i].state != SS_OFFLINE) + { + elog(WARNING, "Connection with node %s:%s failed: %m", + walkeeper[i].host, walkeeper[i].port); + + /* Close old connection */ + closesocket(walkeeper[i].sock); + walkeeper[i].sock = PGINVALID_SOCKET; + walkeeper[i].state = SS_OFFLINE; + + /* Postgres wait event set API doesn't support deletion of events, so we have to reconstruct set */ + ResetWalProposerEventSet(); + } + + /* Try to establish new connection */ + walkeeper[i].sock = ConnectSocketAsync(walkeeper[i].host, walkeeper[i].port, &established); + if (walkeeper[i].sock != PGINVALID_SOCKET) + { + elog(LOG, "%s with node %s:%s", + established ? "Connected" : "Connecting", walkeeper[i].host, walkeeper[i].port); + + + if (established) + { + /* Start handshake: first of all send information about server */ + if (WriteSocket(walkeeper[i].sock, &serverInfo, sizeof serverInfo)) + { + walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_READABLE, walkeeper[i].sock, NULL, &walkeeper[i]); + walkeeper[i].state = SS_HANDSHAKE; + walkeeper[i].asyncOffs = 0; + } + else + { + ResetConnection(i); + } + } + else + { + walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, walkeeper[i].sock, NULL, &walkeeper[i]); + walkeeper[i].state = SS_CONNECTING; + } + } +} + + +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_WALKEEPERS]; + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_walkeepers; i++) + { + responses[i] = walkeeper[i].feedback.epoch == prop.epoch + ? 
walkeeper[i].feedback.flushLsn : prop.VCL; + } + qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_walkeepers - quorum]; +} + +static void +HandleWalKeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastFeedback.flushLsn) + { + lastFeedback.flushLsn = minQuorumLsn; + ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); + } + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) + { + lastFeedback.hs = hsFeedback; + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + + + /* Cleanup message queue */ + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1)) + { + WalMessage* msg = msgQueueHead; + msgQueueHead = msg->next; + if (restartLsn < msg->req.beginLsn) + restartLsn = msg->req.endLsn; + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(WalKeeperRequest)); + free(msg); + } + if (!msgQueueHead) /* queue is empty */ + msgQueueTail = NULL; +} + +char *zenith_timeline_walproposer = NULL; + +/* + * WAL proposer bgworeker entry point + */ +void +WalProposerMain(Datum main_arg) +{ + char* host; + char* sep; + char* port; + + /* Establish signal handlers. 
*/ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + /* Load the libpq-specific functions */ + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + + load_file("zenith", false); + + BackgroundWorkerUnblockSignals(); + + for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + { + port = strchr(host, ':'); + if (port == NULL) { + elog(FATAL, "port is not specified"); + } + *port++ = '\0'; + sep = strchr(port, ','); + if (sep != NULL) + *sep++ = '\0'; + if (n_walkeepers+1 >= MAX_WALKEEPERS) + { + elog(FATAL, "Too many walkeepers"); + } + walkeeper[n_walkeepers].host = host; + walkeeper[n_walkeepers].port = port; + walkeeper[n_walkeepers].state = SS_OFFLINE; + walkeeper[n_walkeepers].sock = PGINVALID_SOCKET; + walkeeper[n_walkeepers].currMsg = NULL; + n_walkeepers += 1; + } + if (n_walkeepers < 1) + { + elog(FATAL, "WalKeepers addresses are not specified"); + } + quorum = n_walkeepers/2 + 1; + + GetXLogReplayRecPtr(&ThisTimeLineID); + + /* Fill information about server */ + serverInfo.timeline = ThisTimeLineID; + serverInfo.walEnd = GetFlushRecPtr(); + serverInfo.walSegSize = wal_segment_size; + serverInfo.pgVersion = PG_VERSION_NUM; + if (!zenith_timeline_walproposer) + elog(FATAL, "zenith.zenith_timeline is not provided"); + if (*zenith_timeline_walproposer != '\0' && + !HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + serverInfo.protocolVersion = SK_PROTOCOL_VERSION; + pg_strong_random(&serverInfo.nodeId.uuid, sizeof(serverInfo.nodeId.uuid)); + serverInfo.systemId = GetSystemIdentifier(); + + last_reconnect_attempt = GetCurrentTimestamp(); + + application_name = (char *) "walproposer"; /* for synchronous_standby_names */ + am_wal_proposer = true; + am_walsender = true; + InitWalSender(); + ResetWalProposerEventSet(); 
+ + /* Initiate connections to all walkeeper nodes */ + for (int i = 0; i < n_walkeepers; i++) + { + ResetConnection(i); + } + + while (true) + WalProposerPoll(); +} + +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + /* + * Always start streaming at the beginning of a segment + */ + startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); + + cmd.slotname = NULL; + cmd.timeline = serverInfo.timeline; + cmd.startpoint = startpos; + StartReplication(&cmd); +} + +/* + * Send message to the particular node + */ +static void +SendMessageToNode(int i, WalMessage* msg) +{ + ssize_t rc; + + /* If there is no pending message then send new one */ + if (walkeeper[i].currMsg == NULL) + { + /* Skip already acknowledged messages */ + while (msg != NULL && (msg->ackMask & (1 << i)) != 0) + msg = msg->next; + + walkeeper[i].currMsg = msg; + } + else + msg = walkeeper[i].currMsg; + + if (msg != NULL) + { + msg->req.restartLsn = restartLsn; + msg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); + + elog(LOG, "sending message with len %ld VCL=%X/%X to %d", + msg->size - sizeof(WalKeeperRequest), + (uint32) (msg->req.commitLsn >> 32), (uint32) msg->req.commitLsn, i); + + rc = WriteSocketAsync(walkeeper[i].sock, &msg->req, msg->size); + if (rc < 0) + { + ResetConnection(i); + } + else if ((size_t)rc == msg->size) /* message was completely sent */ + { + walkeeper[i].asyncOffs = 0; + walkeeper[i].state = SS_RECV_FEEDBACK; + } + else + { + /* wait until socket is available for write */ + walkeeper[i].state = SS_SEND_WAL; + walkeeper[i].asyncOffs = rc; + ModifyWaitEvent(waitEvents, walkeeper[i].eventPos, WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE, NULL); + } + } +} + +/* + * Broadcast new message to all caught-up walkeepers + */ +static void +BroadcastMessage(WalMessage* msg) +{ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE && walkeeper[i].currMsg == NULL) + { + SendMessageToNode(i, msg); + } + } +} + 
+static WalMessage* +CreateMessage(XLogRecPtr startpos, char* data, int len) +{ + /* Create new message and append it to message queue */ + WalMessage* msg; + XLogRecPtr endpos; + len -= XLOG_HDR_SIZE; + endpos = startpos + len; + if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) + { + /* Message already queued */ + return NULL; + } + Assert(len >= 0); + msg = (WalMessage*)malloc(sizeof(WalMessage) + len); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; + + msg->size = sizeof(WalKeeperRequest) + len; + msg->next = NULL; + msg->ackMask = 0; + msg->req.beginLsn = startpos; + msg->req.endLsn = endpos; + msg->req.senderId = prop.nodeId; + memcpy(&msg->req+1, data + XLOG_HDR_SIZE, len); + + Assert(msg->req.endLsn >= lastSentLsn); + lastSentLsn = msg->req.endLsn; + return msg; +} + +void +WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) +{ + WalMessage* msg = CreateMessage(startpos, data, len); + if (msg != NULL) + BroadcastMessage(msg); +} + +/* + * Create WAL message with no data, just to let the walkeepers + * know that the VCL has advanced. + */ +static WalMessage* +CreateMessageVCLOnly(void) +{ + /* Create new message and append it to message queue */ + WalMessage* msg; + + if (lastSentLsn == 0) + { + /* FIXME: We haven't sent anything yet. Not sure what to do then.. 
*/ + return NULL; + } + + msg = (WalMessage*)malloc(sizeof(WalMessage)); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; + + msg->size = sizeof(WalKeeperRequest); + msg->next = NULL; + msg->ackMask = 0; + msg->req.beginLsn = lastSentLsn; + msg->req.endLsn = lastSentLsn; + msg->req.senderId = prop.nodeId; + /* restartLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ + return msg; +} + + +/* + * Prepare vote request for election + */ +static void +StartElection(void) +{ + // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" + XLogRecPtr initWALPos = serverInfo.walSegSize; + prop.VCL = restartLsn = initWALPos; + prop.nodeId = serverInfo.nodeId; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_VOTING) + { + prop.nodeId.term = Max(walkeeper[i].info.server.nodeId.term, prop.nodeId.term); + restartLsn = Max(walkeeper[i].info.restartLsn, restartLsn); + if (walkeeper[i].info.epoch > prop.epoch + || (walkeeper[i].info.epoch == prop.epoch && walkeeper[i].info.flushLsn > prop.VCL)) + + { + prop.epoch = walkeeper[i].info.epoch; + prop.VCL = walkeeper[i].info.flushLsn; + leader = i; + } + } + } + /* Only walkeepers from most recent epoch can report it's FlushLsn to master */ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_VOTING) + { + if (walkeeper[i].info.epoch == prop.epoch) + { + walkeeper[i].feedback.flushLsn = walkeeper[i].info.flushLsn; + } + else + { + elog(WARNING, "WalKeeper %s:%s belongs to old epoch " INT64_FORMAT " while current epoch is " INT64_FORMAT, + walkeeper[i].host, + walkeeper[i].port, + walkeeper[i].info.epoch, + prop.epoch); + } + } + } + prop.nodeId.term += 1; + prop.epoch += 1; +} + + +static void +ReconnectWalKeepers(void) +{ + /* Initiate reconnect if timeout is expired */ + TimestampTz now = GetCurrentTimestamp(); + if (wal_acceptor_reconnect_timeout > 0 && now - 
last_reconnect_attempt > wal_acceptor_reconnect_timeout*1000) + { + last_reconnect_attempt = now; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_OFFLINE) + ResetConnection(i); + } + } +} + +/* + * Receive WAL from most advanced WAL keeper + */ +static bool +WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; + + sprintf(conninfo, "host=%s port=%s dbname=replication", + walkeeper[leader].host, walkeeper[leader].port); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + walkeeper[leader].host, walkeeper[leader].port, + err))); + return false; + } + elog(LOG, "Start recovery from %s:%s starting from %X/%08X till %X/%08X timeline %d", + walkeeper[leader].host, walkeeper[leader].port, + (uint32)(startpos>>32), (uint32)startpos, (uint32)(endpos >> 32), (uint32)endpos, + timeline); + + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; + + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) > 0) + { + Assert(buf[0] == 'w'); + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + (void)CreateMessage(rec_start_lsn, buf, len); + if (rec_end_lsn >= endpos) + break; + } + walrcv_endstreaming(wrconn, &timeline); + walrcv_disconnect(wrconn); + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32)(startpos >> 32), (uint32)startpos))); + return 
false; + } + /* Setup restart point for all walkeepers */ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + { + for (WalMessage* msg = msgQueueHead; msg != NULL; msg = msg->next) + { + if (msg->req.endLsn <= walkeeper[i].info.flushLsn) + { + msg->ackMask |= 1 << i; /* message is already received by this walkeeper */ + } + else + { + SendMessageToNode(i, msg); + break; + } + } + } + } + return true; +} + +void +WalProposerPoll(void) +{ + while (true) + { + WaitEvent event; + int rc = WaitEventSetWait(waitEvents, -1, &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + WalKeeper* wk = (WalKeeper*)event.user_data; + int i = (int)(wk - walkeeper); + + /* If wait is terminated by error, postmaster die or latch event, then exit loop */ + if (rc <= 0 || (event.events & (WL_POSTMASTER_DEATH|WL_LATCH_SET)) != 0) + { + ResetLatch(MyLatch); + break; + } + + /* communication with walkeepers */ + if (event.events & WL_SOCKET_READABLE) + { + switch (wk->state) + { + case SS_HANDSHAKE: + /* Receive walkeeper node state */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info + wk->asyncOffs, + sizeof(wk->info) - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->info)) + { + /* WalKeeper response completely received */ + + /* Check protocol version */ + if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) + { + elog(WARNING, "WalKeeper has incompatible protocol version %d vs. 
%d", + wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); + ResetConnection(i); + } + else + { + wk->state = SS_VOTING; + wk->feedback.flushLsn = restartLsn; + wk->feedback.hs.ts = 0; + + /* Check if we have quorum */ + if (++n_connected >= quorum) + { + if (n_connected == quorum) + StartElection(); + + /* Now send max-node-id to everyone participating in voting and wait their responses */ + for (int j = 0; j < n_walkeepers; j++) + { + if (walkeeper[j].state == SS_VOTING) + { + if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) + { + ResetConnection(j); + } + else + { + walkeeper[j].asyncOffs = 0; + walkeeper[j].state = SS_WAIT_VERDICT; + } + } + } + } + } + } + break; + + case SS_WAIT_VERDICT: + /* Receive walkeeper response for our candidate */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info.server.nodeId + wk->asyncOffs, + sizeof(wk->info.server.nodeId) - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) + { + /* Response completely received */ + + /* If server accept our candidate, then it returns it in response */ + if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + { + elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->info.server.nodeId.term, prop.nodeId.term); + } + else + { + /* Handshake completed, do we have quorum? 
*/ + wk->state = SS_IDLE; + if (++n_votes == quorum) + { + elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", + quorum, + (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) + ); + + /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + if (restartLsn != prop.VCL) + { + /* Perform recovery */ + if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + elog(FATAL, "Failed to recover state"); + } + WalProposerStartStreaming(prop.VCL); + /* Should not return here */ + } + else + { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } + } + } + break; + + case SS_RECV_FEEDBACK: + /* Read walkeeper response with flushed WAL position */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->feedback + wk->asyncOffs, + sizeof(wk->feedback) - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) + { + WalMessage* next = wk->currMsg->next; + Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + wk->state = SS_IDLE; + wk->asyncOffs = 0; + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); + + /* + * Also send the new VCL to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other outbound messages + * pending. 
+ */ + if (true) + { + XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + WalMessage *vclUpdateMsg; + + if (minQuorumLsn > lastSentVCLLsn) + { + vclUpdateMsg = CreateMessageVCLOnly(); + if (vclUpdateMsg) + BroadcastMessage(vclUpdateMsg); + lastSentVCLLsn = minQuorumLsn; + } + } + } + break; + + case SS_IDLE: + elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port); + ResetConnection(i); + break; + + default: + elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state); + } + } + else if (event.events & WL_SOCKET_WRITEABLE) + { + switch (wk->state) + { + case SS_CONNECTING: + { + int optval = 0; + ACCEPT_TYPE_ARG3 optlen = sizeof(optval); + if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) + { + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, + strerror(optval)); + closesocket(wk->sock); + wk->sock = PGINVALID_SOCKET; + wk->state = SS_OFFLINE; + ResetWalProposerEventSet(); + } + else + { + uint32 len = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + /* + * Start handshake: send information about server. + * First of all send 0 as package size: it allows walkeeper to distinguish + * wal_proposer's connection from standard replication connection from pagers. 
+ */ + if (WriteSocket(wk->sock, &len, sizeof len) + && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) + { + wk->state = SS_HANDSHAKE; + wk->asyncOffs = 0; + } + else + { + ResetConnection(i); + } + } + break; + } + + case SS_SEND_WAL: + rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == wk->currMsg->size) + { + /* WAL block completely sent */ + wk->state = SS_RECV_FEEDBACK; + wk->asyncOffs = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + } + break; + + default: + elog(FATAL, "Unexpected write state %d", wk->state); + } + } + ReconnectWalKeepers(); + } +} + + +/* + * WalProposerRegister + * Register a background worker porposing WAL to wal acceptors + */ +void +WalProposerRegister(void) +{ + BackgroundWorker bgw; + + if (*wal_acceptors_list == '\0') + return; + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c new file mode 100644 index 00000000000..cea41ef01cc --- /dev/null +++ b/src/backend/replication/walproposer_utils.c @@ -0,0 +1,237 @@ +#include "replication/walproposer.h" +#include "common/logging.h" +#include "common/ip.h" +#include +#include + +int CompareNodeId(NodeId* id1, NodeId* id2) +{ + return + (id1->term < id2->term) + ? -1 + : (id1->term > id2->term) + ? 
1 + : memcmp(&id1->uuid, &id2->uuid, sizeof(pg_uuid_t)); +} + +int +CompareLsn(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return -1; + else if (lsn1 == lsn2) + return 0; + else + return 1; +} + +static bool +SetSocketOptions(pgsocket sock) +{ + int on = 1; + if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, + (char *) &on, sizeof(on)) < 0) + { + elog(WARNING, "setsockopt(TCP_NODELAY) failed: %m"); + closesocket(sock); + return false; + } + if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *) &on, sizeof(on)) < 0) + { + elog(WARNING, "setsockopt(SO_REUSEADDR) failed: %m"); + closesocket(sock); + return false; + } + if (!pg_set_noblock(sock)) + { + elog(WARNING, "failed to switch socket to non-blocking mode: %m"); + closesocket(sock); + return false; + } + return true; +} + +pgsocket +ConnectSocketAsync(char const* host, char const* port, bool* established) +{ + struct addrinfo *addrs = NULL, + *addr, + hints; + int ret; + pgsocket sock = PGINVALID_SOCKET; + + hints.ai_flags = AI_PASSIVE; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + hints.ai_addrlen = 0; + hints.ai_addr = NULL; + hints.ai_canonname = NULL; + hints.ai_next = NULL; + ret = pg_getaddrinfo_all(host, port, &hints, &addrs); + if (ret || !addrs) + { + elog(WARNING, "Could not resolve \"%s\": %s", + host, gai_strerror(ret)); + return -1; + } + for (addr = addrs; addr; addr = addr->ai_next) + { + sock = socket(addr->ai_family, SOCK_STREAM, 0); + if (sock == PGINVALID_SOCKET) + { + elog(WARNING, "could not create socket: %m"); + continue; + } + if (!SetSocketOptions(sock)) + continue; + + /* + * Bind it to a kernel assigned port on localhost and get the assigned + * port via getsockname(). 
+ */ + while ((ret = connect(sock, addr->ai_addr, addr->ai_addrlen)) < 0 && errno == EINTR); + if (ret < 0) + { + if (errno == EINPROGRESS) + { + *established = false; + break; + } + elog(WARNING, "Could not establish connection to %s:%s: %m", + host, port); + closesocket(sock); + } + else + { + *established = true; + break; + } + } + return sock; +} +ssize_t +ReadSocketAsync(pgsocket sock, void* buf, size_t size) +{ + size_t offs = 0; + + while (size != offs) + { + ssize_t rc = recv(sock, (char*)buf + offs, size - offs, 0); + if (rc < 0) + { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return offs; + elog(WARNING, "Socket write failed: %m"); + return -1; + } + else if (rc == 0) + { + elog(WARNING, "Connection was closed by peer"); + return -1; + } + offs += rc; + } + return offs; +} + +ssize_t +WriteSocketAsync(pgsocket sock, void const* buf, size_t size) +{ + size_t offs = 0; + + while (size != offs) + { + ssize_t rc = send(sock, (char const*)buf + offs, size - offs, 0); + if (rc < 0) + { + if (errno == EINTR) + continue; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return offs; + elog(WARNING, "Socket write failed: %m"); + return -1; + } + else if (rc == 0) + { + elog(WARNING, "Connection was closed by peer"); + return -1; + } + offs += rc; + } + return offs; +} + +bool +WriteSocket(pgsocket sock, void const* buf, size_t size) +{ + char* src = (char*)buf; + + while (size != 0) + { + ssize_t rc = send(sock, src, size, 0); + if (rc < 0) + { + if (errno == EINTR) + continue; + elog(WARNING, "Socket write failed: %m"); + return false; + } + else if (rc == 0) + { + elog(WARNING, "Connection was closed by peer"); + return false; + } + size -= rc; + src += rc; + } + return true; +} + +/* + * Convert a character which represents a hexadecimal digit to an integer. + * + * Returns -1 if the character is not a hexadecimal digit. 
+ */ +static int +HexDecodeChar(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + + return -1; +} + +/* + * Decode a hex string into a byte string, 2 hex chars per byte. + * + * Returns false if invalid characters are encountered; otherwise true. + */ +bool +HexDecodeString(uint8 *result, char *input, int nbytes) +{ + int i; + + for (i = 0; i < nbytes; ++i) + { + int n1 = HexDecodeChar(input[i * 2]); + int n2 = HexDecodeChar(input[i * 2 + 1]); + + if (n1 < 0 || n2 < 0) + return false; + result[i] = n1 * 16 + n2; + } + + return true; +} + diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3b245c619fc..c17dd98b3da 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -73,6 +73,7 @@ #include "replication/slot.h" #include "replication/snapbuild.h" #include "replication/syncrep.h" +#include "replication/walproposer.h" #include "replication/walreceiver.h" #include "replication/walsender.h" #include "replication/walsender_private.h" @@ -234,7 +235,7 @@ static XLogRecPtr GetStandbyFlushRecPtr(void); static void IdentifySystem(void); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); -static void StartReplication(StartReplicationCmd *cmd); +void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); @@ -567,7 +568,7 @@ SendTimeLineHistory(TimeLineHistoryCmd *cmd) * At the moment, this never returns, but an ereport(ERROR) will take us back * to the main loop. 
*/ -static void +void StartReplication(StartReplicationCmd *cmd) { StringInfoData buf; @@ -708,11 +709,14 @@ StartReplication(StartReplicationCmd *cmd) WalSndSetState(WALSNDSTATE_CATCHUP); /* Send a CopyBothResponse message, and start streaming */ - pq_beginmessage(&buf, 'W'); - pq_sendbyte(&buf, 0); - pq_sendint16(&buf, 0); - pq_endmessage(&buf); - pq_flush(); + if (!am_wal_proposer) + { + pq_beginmessage(&buf, 'W'); + pq_sendbyte(&buf, 0); + pq_sendint16(&buf, 0); + pq_endmessage(&buf); + pq_flush(); + } /* * Don't allow a request to stream from a future point in WAL that @@ -1324,7 +1328,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, } /* Try to flush pending output to the client */ - if (pq_flush_if_writable() != 0) + if (!am_wal_proposer && pq_flush_if_writable() != 0) WalSndShutdown(); } @@ -1716,6 +1720,9 @@ ProcessRepliesIfAny(void) int r; bool received = false; + if (am_wal_proposer) + return; + last_processing = GetCurrentTimestamp(); /* @@ -1891,21 +1898,34 @@ ProcessStandbyReplyMessage(void) flushPtr, applyPtr; bool replyRequested; - TimeOffset writeLag, - flushLag, - applyLag; - bool clearLagTimes; - TimestampTz now; TimestampTz replyTime; - static bool fullyAppliedLastTime = false; - /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); flushPtr = pq_getmsgint64(&reply_message); applyPtr = pq_getmsgint64(&reply_message); replyTime = pq_getmsgint64(&reply_message); replyRequested = pq_getmsgbyte(&reply_message); + ProcessStandbyReply(writePtr, + flushPtr, + applyPtr, + replyTime, + replyRequested); +} + +void +ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested) +{ + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + static bool fullyAppliedLastTime = false; if (message_level_is_interesting(DEBUG2)) { @@ -2088,7 +2108,16 @@ 
ProcessStandbyHSFeedbackMessage(void) feedbackEpoch = pq_getmsgint(&reply_message, 4); feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + ProcessStandbyHSFeedback(replyTime, feedbackXmin, feedbackEpoch, feedbackCatalogXmin, feedbackCatalogEpoch); +} +void +ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch) +{ if (message_level_is_interesting(DEBUG2)) { char *replyTimeStr; @@ -2296,6 +2325,19 @@ WalSndLoop(WalSndSendDataCallback send_data) /* Check for input from the client */ ProcessRepliesIfAny(); + if (am_wal_proposer) + { + send_data(); + if (WalSndCaughtUp) + { + if (MyWalSnd->state == WALSNDSTATE_CATCHUP) + WalSndSetState(WALSNDSTATE_STREAMING); + WalProposerPoll(); + WalSndCaughtUp = false; + } + continue; + } + /* * If we have received CopyDone from the client, sent CopyDone * ourselves, and the output buffer is empty, it's time to exit @@ -2757,9 +2799,12 @@ XLogSendPhysical(void) /* * OK to read and send the slice. */ - resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ pq_sendint64(&output_message, 0); /* sendtime, filled in last */ @@ -2812,16 +2857,22 @@ XLogSendPhysical(void) output_message.len += nbytes; output_message.data[output_message.len] = '\0'; - /* - * Fill the send timestamp last, so that it is taken as late as possible. 
- */ - resetStringInfo(&tmpbuf); - pq_sendint64(&tmpbuf, GetCurrentTimestamp()); - memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], - tmpbuf.data, sizeof(int64)); - - pq_putmessage_noblock('d', output_message.data, output_message.len); + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, output_message.data, output_message.len); + } + else + { + /* + * Fill the send timestamp last, so that it is taken as late as possible. + */ + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, GetCurrentTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + pq_putmessage_noblock('d', output_message.data, output_message.len); + } sentPtr = endptr; /* Update shared memory status */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 6652a60ec31..2a7f8136142 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -79,6 +79,7 @@ #include "replication/syncrep.h" #include "replication/walreceiver.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm_impl.h" #include "storage/fd.h" @@ -183,6 +184,7 @@ static int syslog_facility = 0; static void assign_syslog_facility(int newval, void *extra); static void assign_syslog_ident(const char *newval, void *extra); static void assign_session_replication_role(int newval, void *extra); + static bool check_temp_buffers(int *newval, void **extra, GucSource source); static bool check_bonjour(bool *newval, void **extra, GucSource source); static bool check_ssl(bool *newval, void **extra, GucSource source); @@ -2284,6 +2286,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_acceptor_reconnect", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Timeout for reconnecting to offline wal acceptor."), + NULL, + GUC_UNIT_MS + }, + &wal_acceptor_reconnect_timeout, + 1000, 0, INT_MAX, + NULL, NULL, NULL + }, + { 
{"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the maximum number of concurrent connections."), @@ -4588,6 +4601,17 @@ static struct config_string ConfigureNamesString[] = check_backtrace_functions, assign_backtrace_functions, NULL }, + { + {"wal_acceptors", PGC_POSTMASTER, UNGROUPED, + gettext_noop("List of Zenith WAL acceptors (host:port)"), + NULL, + GUC_LIST_INPUT | GUC_LIST_QUOTE + }, + &wal_acceptors_list, + "", + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL @@ -11739,6 +11763,7 @@ assign_session_replication_role(int newval, void *extra) ResetPlanCache(); } + static bool check_temp_buffers(int *newval, void **extra, GucSource source) { diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h new file mode 100644 index 00000000000..e1845e3fb19 --- /dev/null +++ b/src/include/replication/walproposer.h @@ -0,0 +1,174 @@ +#ifndef __WALKEEPER_H__ +#define __WALKEEPER_H__ + +#include "postgres.h" +#include "access/xlog_internal.h" +#include "access/transam.h" +#include "nodes/replnodes.h" +#include "utils/uuid.h" + +#define SK_MAGIC 0xCafeCeefu +#define SK_PROTOCOL_VERSION 1 + +#define MAX_WALKEEPERS 32 +#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ +#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ +#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ + +extern char* wal_acceptors_list; +extern int wal_acceptor_reconnect_timeout; +extern bool am_wal_proposer; + +struct WalMessage; +typedef struct WalMessage WalMessage; + +extern char *zenith_timeline_walproposer; + +/* WAL safekeeper state */ +typedef enum +{ + SS_OFFLINE, + SS_CONNECTING, + SS_HANDSHAKE, + SS_VOTING, + SS_WAIT_VERDICT, + SS_IDLE, + SS_SEND_WAL, + SS_RECV_FEEDBACK +} WalKeeperState; + +/* + * Unique node identifier used by Paxos + */ +typedef struct NodeId +{ + uint64 term; + 
pg_uuid_t uuid; +} NodeId; + +/* + * Information about Postgres server broadcasted by WAL proposer to walkeeper + */ +typedef struct ServerInfo +{ + uint32 protocolVersion; /* proposer-walkeeper protocol version */ + uint32 pgVersion; /* Postgres server version */ + NodeId nodeId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + XLogRecPtr walEnd; + TimeLineID timeline; + int walSegSize; +} ServerInfo; + +/* + * Vote request sent from proposer to walkeepers + */ +typedef struct RequestVote +{ + NodeId nodeId; + XLogRecPtr VCL; /* volume commit LSN */ + uint64 epoch; /* new epoch when walkeeper reaches VCL */ +} RequestVote; + +/* + * Information of about storage node + */ +typedef struct WalKeeperInfo +{ + uint32 magic; /* magic for verifying content the control file */ + uint32 formatVersion; /* walkeeper format version */ + uint64 epoch; /* walkeeper's epoch */ + ServerInfo server; + XLogRecPtr commitLsn; /* part of WAL acknowledged by quorum */ + XLogRecPtr flushLsn; /* locally flushed part of WAL */ + XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper: min(commitLsn) for all walkeepers */ +} WalKeeperInfo; + +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + + +/* + * Request with WAL message sent from proposer to walkeeper. 
+ */ +typedef struct WalKeeperRequest +{ + NodeId senderId; /* Sender's node identifier (looks like we do not need it for TCP streaming connection) */ + XLogRecPtr beginLsn; /* start position of message in WAL */ + XLogRecPtr endLsn; /* end position of message in WAL */ + XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ + XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ +} WalKeeperRequest; + +/* + * All copy data message ('w') are linked in L1 send list and asynchronously sent to receivers. + * When message is sent to all receivers, it is removed from send list. + */ +struct WalMessage +{ + WalMessage* next; /* L1 list of messages */ + uint32 size; /* message size */ + uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ + WalKeeperRequest req; /* request to walkeeper (message header) */ +}; + +/* + * Report walkeeper state to proposer + */ +typedef struct WalKeeperResponse +{ + uint64 epoch; + XLogRecPtr flushLsn; + HotStandbyFeedback hs; +} WalKeeperResponse; + + +/* + * Descriptor of walkeeper + */ +typedef struct WalKeeper +{ + char const* host; + char const* port; + pgsocket sock; /* socket descriptor */ + WalMessage* currMsg; /* message been send to the receiver */ + int asyncOffs;/* offset for asynchronus read/write operations */ + int eventPos; /* position in wait event set */ + WalKeeperState state;/* walkeeper state machine state */ + WalKeeperInfo info; /* walkeeper info */ + WalKeeperResponse feedback; /* feedback to master */ +} WalKeeper; + + +int CompareNodeId(NodeId* id1, NodeId* id2); +pgsocket ConnectSocketAsync(char const* host, char const* port, bool* established); +bool WriteSocket(pgsocket sock, void const* buf, size_t size); +ssize_t ReadSocketAsync(pgsocket sock, void* buf, size_t size); +ssize_t WriteSocketAsync(pgsocket sock, void const* buf, size_t size); +int CompareLsn(const void *a, const void *b); +void WalProposerMain(Datum 
main_arg); +void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); +bool HexDecodeString(uint8 *result, char *input, int nbytes); +void WalProposerPoll(void); +void WalProposerRegister(void); +void ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested); +void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); +void StartReplication(StartReplicationCmd *cmd); + +#endif From 782eacfd2390ecdb66a277fb5e6fb9f2b1047a35 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:41:11 +0300 Subject: [PATCH 008/167] persist_unlogged_tables.patch Ignore unlogged table qualifier. Add respective changes to regression test outputs. Author: Konstantin Knizhnik --- src/backend/commands/tablecmds.c | 6 + src/test/regress/expected/alter_table_1.out | 4487 ++++++++++++++++++ src/test/regress/expected/create_table_1.out | 1315 +++++ 3 files changed, 5808 insertions(+) create mode 100644 src/test/regress/expected/alter_table_1.out create mode 100644 src/test/regress/expected/create_table_1.out diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 4729a895e83..5bb98b6d116 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -693,6 +693,12 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("cannot create temporary table within security-restricted operation"))); + if (stmt->relation->relpersistence == RELPERSISTENCE_UNLOGGED) + { + /* Unlogged tables are not supported by Zenith */ + stmt->relation->relpersistence = RELPERSISTENCE_PERMANENT; + } + /* * Determine the lockmode to use when scanning parents. A self-exclusive * lock is needed here. 
diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out new file mode 100644 index 00000000000..0f116d8750d --- /dev/null +++ b/src/test/regress/expected/alter_table_1.out @@ -0,0 +1,4487 @@ +-- +-- ALTER_TABLE +-- +-- Clean up in case a prior regression run failed +SET client_min_messages TO 'warning'; +DROP ROLE IF EXISTS regress_alter_table_user1; +RESET client_min_messages; +CREATE USER regress_alter_table_user1; +-- +-- add attribute +-- +CREATE TABLE attmp (initial int4); +COMMENT ON TABLE attmp_wrong IS 'table comment'; +ERROR: relation "attmp_wrong" does not exist +COMMENT ON TABLE attmp IS 'table comment'; +COMMENT ON TABLE attmp IS NULL; +ALTER TABLE attmp ADD COLUMN xmin integer; -- fails +ERROR: column name "xmin" conflicts with a system column name +ALTER TABLE attmp ADD COLUMN a int4 default 3; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + 
'(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +DROP TABLE attmp; +-- the wolf bug - schema mods caused inconsistent row descriptors +CREATE TABLE attmp ( + initial int4 +); +ALTER TABLE attmp ADD COLUMN a int4; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, 
'(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +CREATE INDEX attmp_idx ON attmp (a, (d + e), b); +ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; +ERROR: column number must be in range from 1 to 32767 +LINE 1: ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; + ^ +ALTER INDEX attmp_idx ALTER COLUMN 1 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "a" of index "attmp_idx" +HINT: Alter statistics on table column instead. +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS 1000; +\d+ attmp_idx + Index "public.attmp_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+------------------+------+------------+---------+-------------- + a | integer | yes | a | plain | + expr | double precision | yes | (d + e) | plain | 1000 + b | cstring | yes | b | plain | +btree, for table "public.attmp" + +ALTER INDEX attmp_idx ALTER COLUMN 3 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "b" of index "attmp_idx" +HINT: Alter statistics on table column instead. 
+ALTER INDEX attmp_idx ALTER COLUMN 4 SET STATISTICS 1000; +ERROR: column number 4 of relation "attmp_idx" does not exist +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS -1; +DROP TABLE attmp; +-- +-- rename - check on both non-temp and temp tables +-- +CREATE TABLE attmp (regtable int); +CREATE TEMP TABLE attmp (attmptable int); +ALTER TABLE attmp RENAME TO attmp_new; +SELECT * FROM attmp; + regtable +---------- +(0 rows) + +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +ALTER TABLE attmp RENAME TO attmp_new2; +SELECT * FROM attmp; -- should fail +ERROR: relation "attmp" does not exist +LINE 1: SELECT * FROM attmp; + ^ +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +SELECT * FROM attmp_new2; + regtable +---------- +(0 rows) + +DROP TABLE attmp_new; +DROP TABLE attmp_new2; +-- check rename of partitioned tables and indexes also +CREATE TABLE part_attmp (a int primary key) partition by range (a); +CREATE TABLE part_attmp1 PARTITION OF part_attmp FOR VALUES FROM (0) TO (100); +ALTER INDEX part_attmp_pkey RENAME TO part_attmp_index; +ALTER INDEX part_attmp1_pkey RENAME TO part_attmp1_index; +ALTER TABLE part_attmp RENAME TO part_at2tmp; +ALTER TABLE part_attmp1 RENAME TO part_at2tmp1; +SET ROLE regress_alter_table_user1; +ALTER INDEX part_attmp_index RENAME TO fail; +ERROR: must be owner of index part_attmp_index +ALTER INDEX part_attmp1_index RENAME TO fail; +ERROR: must be owner of index part_attmp1_index +ALTER TABLE part_at2tmp RENAME TO fail; +ERROR: must be owner of table part_at2tmp +ALTER TABLE part_at2tmp1 RENAME TO fail; +ERROR: must be owner of table part_at2tmp1 +RESET ROLE; +DROP TABLE part_at2tmp; +-- +-- check renaming to a table's array type's autogenerated name +-- (the array type's name should get out of the way) +-- +CREATE TABLE attmp_array (id int); +CREATE TABLE attmp_array2 (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + 
+SELECT typname FROM pg_type WHERE oid = 'attmp_array2[]'::regtype; + typname +--------------- + _attmp_array2 +(1 row) + +ALTER TABLE attmp_array2 RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +---------------- + ___attmp_array +(1 row) + +DROP TABLE _attmp_array; +DROP TABLE attmp_array; +-- renaming to table's own array type's name is an interesting corner case +CREATE TABLE attmp_array (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +ALTER TABLE attmp_array RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +DROP TABLE _attmp_array; +-- ALTER TABLE ... RENAME on non-table relations +-- renaming indexes (FIXME: this should probably test the index's functionality) +ALTER INDEX IF EXISTS __onek_unique1 RENAME TO attmp_onek_unique1; +NOTICE: relation "__onek_unique1" does not exist, skipping +ALTER INDEX IF EXISTS __attmp_onek_unique1 RENAME TO onek_unique1; +NOTICE: relation "__attmp_onek_unique1" does not exist, skipping +ALTER INDEX onek_unique1 RENAME TO attmp_onek_unique1; +ALTER INDEX attmp_onek_unique1 RENAME TO onek_unique1; +SET ROLE regress_alter_table_user1; +ALTER INDEX onek_unique1 RENAME TO fail; -- permission denied +ERROR: must be owner of index onek_unique1 +RESET ROLE; +-- renaming views +CREATE VIEW attmp_view (unique1) AS SELECT unique1 FROM tenk1; +ALTER TABLE attmp_view RENAME TO attmp_view_new; +SET ROLE regress_alter_table_user1; +ALTER VIEW attmp_view_new RENAME TO fail; -- permission denied +ERROR: must be owner of view attmp_view_new +RESET ROLE; +-- hack to ensure we get an indexscan here +set enable_seqscan to off; +set enable_bitmapscan to off; +-- 5 values, sorted +SELECT unique1 FROM 
tenk1 WHERE unique1 < 5; + unique1 +--------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +reset enable_seqscan; +reset enable_bitmapscan; +DROP VIEW attmp_view_new; +-- toast-like relation name +alter table stud_emp rename to pg_toast_stud_emp; +alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint because constraint onek_unique1_constraint on table onek requires it +HINT: You can drop constraint onek_unique1_constraint on table onek instead. +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint_foo because constraint onek_unique1_constraint_foo on table onek requires it +HINT: You can drop constraint onek_unique1_constraint_foo on table onek instead. +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraints vs. 
inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1" CHECK (a > 0) + +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +NOTICE: merging column "a" with inherited definition +NOTICE: merging constraint "con1" with inherited definition +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: cannot rename inherited constraint "con1" +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: inherited constraint "con1" must be renamed in child tables too +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0) NO INHERIT; +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | +Indexes: + "con3foo" PRIMARY KEY, btree (a) +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_not_exist RENAME CONSTRAINT con3 TO con3foo; -- ok +NOTICE: relation "constraint_not_exist" does not exist, skipping +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); +NOTICE: relation "constraint_rename_test" does not exist, skipping +-- renaming constraints with cache reset of target relation +CREATE TABLE constraint_rename_cache (a int, + CONSTRAINT chk_a CHECK (a > 0), + PRIMARY KEY (a)); +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT chk_a TO chk_a_new; +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT constraint_rename_cache_pkey TO constraint_rename_pkey_new; +CREATE TABLE like_constraint_rename_cache + (LIKE constraint_rename_cache INCLUDING ALL); +\d like_constraint_rename_cache + Table "public.like_constraint_rename_cache" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | +Indexes: + "like_constraint_rename_cache_pkey" PRIMARY KEY, btree (a) +Check constraints: + "chk_a_new" CHECK (a > 0) + +DROP TABLE constraint_rename_cache; +DROP TABLE like_constraint_rename_cache; +-- FOREIGN KEY CONSTRAINT adding TEST +CREATE TABLE attmp2 (a int primary key); +CREATE TABLE attmp3 (a int, b int); +CREATE TABLE attmp4 (a int, b int, unique(a,b)); +CREATE TABLE attmp5 (a int, b int); +-- Insert rows into attmp2 (pktable) +INSERT INTO attmp2 values (1); +INSERT INTO attmp2 values (2); +INSERT INTO attmp2 values (3); +INSERT INTO attmp2 values (4); +-- Insert rows into attmp3 +INSERT INTO attmp3 values 
(1,10); +INSERT INTO attmp3 values (1,20); +INSERT INTO attmp3 values (5,50); +-- Try (and fail) to add constraint due to invalid source columns +ALTER TABLE attmp3 add constraint attmpconstr foreign key(c) references attmp2 match full; +ERROR: column "c" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid destination columns explicitly given +ALTER TABLE attmp3 add constraint attmpconstr foreign key(a) references attmp2(b) match full; +ERROR: column "b" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid data +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". +-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ALTER TABLE attmp3 drop constraint attmpconstr; +INSERT INTO attmp3 values (5,50); +-- Try NOT VALID and then VALIDATE CONSTRAINT, but fails. Delete failure then re-validate +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full NOT VALID; +ALTER TABLE attmp3 validate constraint attmpconstr; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". 
+-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) and repeat to show it works on already valid constraint +ALTER TABLE attmp3 validate constraint attmpconstr; +ALTER TABLE attmp3 validate constraint attmpconstr; +-- Try a non-verified CHECK constraint +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10); -- fail +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10) NOT VALID; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- fails +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +DELETE FROM attmp3 WHERE NOT b > 10; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +-- Test inherited NOT VALID CHECK constraints +select * from attmp3; + a | b +---+---- + 1 | 20 +(1 row) + +CREATE TABLE attmp6 () INHERITS (attmp3); +CREATE TABLE attmp7 () INHERITS (attmp3); +INSERT INTO attmp6 VALUES (6, 30), (7, 16); +ALTER TABLE attmp3 ADD CONSTRAINT b_le_20 CHECK (b <= 20) NOT VALID; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- fails +ERROR: check constraint "b_le_20" of relation "attmp6" is violated by some row +DELETE FROM attmp6 WHERE b > 20; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- succeeds +-- An already validated constraint must not be revalidated +CREATE FUNCTION boo(int) RETURNS int IMMUTABLE STRICT LANGUAGE plpgsql AS $$ BEGIN RAISE NOTICE 'boo: %', $1; RETURN $1; END; $$; +INSERT INTO attmp7 VALUES (8, 18); +ALTER TABLE attmp7 ADD CONSTRAINT identity CHECK (b = boo(b)); +NOTICE: boo: 18 +ALTER TABLE attmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID; +NOTICE: merging constraint "identity" with inherited definition +ALTER TABLE attmp3 VALIDATE CONSTRAINT identity; +NOTICE: boo: 20 +NOTICE: boo: 16 +-- A NO INHERIT constraint should not be 
looked for in children during VALIDATE CONSTRAINT +create table parent_noinh_convalid (a int); +create table child_noinh_convalid () inherits (parent_noinh_convalid); +insert into parent_noinh_convalid values (1); +insert into child_noinh_convalid values (1); +alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid; +-- fail, because of the row in parent +alter table parent_noinh_convalid validate constraint check_a_is_2; +ERROR: check constraint "check_a_is_2" of relation "parent_noinh_convalid" is violated by some row +delete from only parent_noinh_convalid; +-- ok (parent itself contains no violating rows) +alter table parent_noinh_convalid validate constraint check_a_is_2; +select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2'; + convalidated +-------------- + t +(1 row) + +-- cleanup +drop table parent_noinh_convalid, child_noinh_convalid; +-- Try (and fail) to create constraint from attmp5(a) to attmp4(a) - unique constraint on +-- attmp4 is a,b +ALTER TABLE attmp5 add constraint attmpconstr foreign key(a) references attmp4(a) match full; +ERROR: there is no unique constraint matching given keys for referenced table "attmp4" +DROP TABLE attmp7; +DROP TABLE attmp6; +DROP TABLE attmp5; +DROP TABLE attmp4; +DROP TABLE attmp3; +DROP TABLE attmp2; +-- NOT VALID with plan invalidation -- ensure we don't use a constraint for +-- exclusion until validated +set constraint_exclusion TO 'partition'; +create table nv_parent (d date, check (false) no inherit not valid); +-- not valid constraint added at creation time should automatically become valid +\d nv_parent + Table "public.nv_parent" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_parent_check" CHECK (false) NO INHERIT + +create table nv_child_2010 () inherits (nv_parent); +create table nv_child_2011 () inherits 
(nv_parent); +alter table nv_child_2010 add check (d between '2010-01-01'::date and '2010-12-31'::date) not valid; +alter table nv_child_2011 add check (d between '2011-01-01'::date and '2011-12-31'::date) not valid; +explain (costs off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +create table nv_child_2009 (check (d between '2009-01-01'::date and '2009-12-31'::date)) inherits (nv_parent); +explain (costs off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_4 + Filter: ((d >= 
'08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(9 rows) + +-- after validation, the constraint should be used +alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(7 rows) + +-- add an inherited NOT VALID constraint +alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; +\d nv_child_2009 + Table "public.nv_child_2009" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_child_2009_d_check" CHECK (d >= '01-01-2009'::date AND d <= '12-31-2009'::date) + "nv_parent_d_check" CHECK (d >= '01-01-2001'::date AND d <= '12-31-2099'::date) NOT VALID +Inherits: nv_parent + +-- we leave nv_parent and children around to help test pg_dump logic +-- Foreign key adding test with mixed types +-- Note: these tables are TEMP to avoid name conflicts when this test +-- is run in parallel with foreign_key.sql. +CREATE TEMP TABLE PKTABLE (ptest1 int PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 inet); +-- This next should fail, because int=inet does not exist +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. 
+-- This should also fail for the same reason, but here we +-- give the column name +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable(ptest1); +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +-- This should succeed, even though they are different types, +-- because int=int8 exists and is a member of the integer opfamily +CREATE TEMP TABLE FKTABLE (ftest1 int8); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +-- This should fail, because we'd have to cast numeric to int which is +-- not an implicit coercion (or use numeric=numeric, but that's not part +-- of the integer opfamily) +CREATE TEMP TABLE FKTABLE (ftest1 numeric); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: numeric and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- On the other hand, this should work because int implicitly promotes to +-- numeric, and we allow promotion on the FK side +CREATE TEMP TABLE PKTABLE (ptest1 numeric PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". 
+DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +CREATE TEMP TABLE PKTABLE (ptest1 int, ptest2 inet, + PRIMARY KEY(ptest1, ptest2)); +-- This should fail, because we just chose really odd types +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) references pktable; +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- Again, so should this... +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- This fails because we mixed up the column ordering +CREATE TEMP TABLE FKTABLE (ftest1 int, ftest2 inet); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest2, ptest1); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest2" are of incompatible types: integer and inet. +-- As does this... +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest2_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer. 
+DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- Test that ALTER CONSTRAINT updates trigger deferrability properly +CREATE TEMP TABLE PKTABLE (ptest1 int primary key); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE; +ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE; +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'pktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+------------------------+--------+--------------+---------------- + fkdd | "RI_FKey_cascade_del" | 9 | f | f + fkdd | "RI_FKey_noaction_upd" | 17 | t | t + fkdd2 | "RI_FKey_cascade_del" | 9 | f | f + fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t + fkdi | "RI_FKey_cascade_del" | 9 | f | f + fkdi | "RI_FKey_noaction_upd" | 17 | t | f + fkdi2 | "RI_FKey_cascade_del" | 9 | f | f + fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f + fknd | "RI_FKey_cascade_del" | 
9 | f | f + fknd | "RI_FKey_noaction_upd" | 17 | f | f + fknd2 | "RI_FKey_cascade_del" | 9 | f | f + fknd2 | "RI_FKey_noaction_upd" | 17 | f | f +(12 rows) + +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'fktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+---------------------+--------+--------------+---------------- + fkdd | "RI_FKey_check_ins" | 5 | t | t + fkdd | "RI_FKey_check_upd" | 17 | t | t + fkdd2 | "RI_FKey_check_ins" | 5 | t | t + fkdd2 | "RI_FKey_check_upd" | 17 | t | t + fkdi | "RI_FKey_check_ins" | 5 | t | f + fkdi | "RI_FKey_check_upd" | 17 | t | f + fkdi2 | "RI_FKey_check_ins" | 5 | t | f + fkdi2 | "RI_FKey_check_upd" | 17 | t | f + fknd | "RI_FKey_check_ins" | 5 | f | f + fknd | "RI_FKey_check_upd" | 17 | f | f + fknd2 | "RI_FKey_check_ins" | 5 | f | f + fknd2 | "RI_FKey_check_upd" | 17 | f | f +(12 rows) + +-- temp tables should go away by themselves, need not drop them. +-- test check constraint adding +create table atacc1 ( test int ); +-- add a check constraint +alter table atacc1 add constraint atacc_test1 check (test>3); +-- should fail +insert into atacc1 (test) values (2); +ERROR: new row for relation "atacc1" violates check constraint "atacc_test1" +DETAIL: Failing row contains (2). 
+-- should succeed
+insert into atacc1 (test) values (4);
+drop table atacc1;
+-- let's do one where the check fails when added
+create table atacc1 ( test int );
+-- insert a soon to be failing row
+insert into atacc1 (test) values (2);
+-- add a check constraint (fails)
+alter table atacc1 add constraint atacc_test1 check (test>3);
+ERROR: check constraint "atacc_test1" of relation "atacc1" is violated by some row
+insert into atacc1 (test) values (4);
+drop table atacc1;
+-- let's do one where the check fails because the column doesn't exist
+create table atacc1 ( test int );
+-- add a check constraint (fails)
+alter table atacc1 add constraint atacc_test1 check (test1>3);
+ERROR: column "test1" does not exist
+HINT: Perhaps you meant to reference the column "atacc1.test".
+drop table atacc1;
+-- something a little more complicated
+create table atacc1 ( test int, test2 int, test3 int);
+-- add a check constraint (fails)
+alter table atacc1 add constraint atacc_test1 check (test+test2<test3*4);
+-- should fail
+insert into atacc1 (test,test2,test3) values (4,4,2);
+ERROR: new row for relation "atacc1" violates check constraint "atacc_test1"
+DETAIL: Failing row contains (4, 4, 2).
+-- should succeed
+insert into atacc1 (test,test2,test3) values (4,4,5);
+drop table atacc1;
+-- lets do some naming tests
+create table atacc1 (test int check (test>3), test2 int);
+alter table atacc1 add check (test2>test);
+-- should fail for $2
+insert into atacc1 (test2, test) values (3, 4);
+ERROR: new row for relation "atacc1" violates check constraint "atacc1_check"
+DETAIL: Failing row contains (4, 3).
+drop table atacc1;
+-- inheritance related tests
+create table atacc1 (test int);
+create table atacc2 (test2 int);
+create table atacc3 (test3 int) inherits (atacc1, atacc2);
+alter table atacc2 add constraint foo check (test2>0);
+-- fail and then succeed on atacc2
+insert into atacc2 (test2) values (-3);
+ERROR: new row for relation "atacc2" violates check constraint "foo"
+DETAIL: Failing row contains (-3).
+insert into atacc2 (test2) values (3);
+-- fail and then succeed on atacc3
+insert into atacc3 (test2) values (-3);
+ERROR: new row for relation "atacc3" violates check constraint "foo"
+DETAIL: Failing row contains (null, -3, null).
+insert into atacc3 (test2) values (3); +drop table atacc3; +drop table atacc2; +drop table atacc1; +-- same things with one created with INHERIT +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc3 no inherit atacc2; +-- fail +alter table atacc3 no inherit atacc2; +ERROR: relation "atacc2" is not a parent of relation "atacc3" +-- make sure it really isn't a child +insert into atacc3 (test2) values (3); +select test2 from atacc2; + test2 +------- +(0 rows) + +-- fail due to missing constraint +alter table atacc2 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +ERROR: child table is missing constraint "foo" +-- fail due to missing column +alter table atacc3 rename test2 to testx; +alter table atacc3 inherit atacc2; +ERROR: child table is missing column "test2" +-- fail due to mismatched data type +alter table atacc3 add test2 bool; +alter table atacc3 inherit atacc2; +ERROR: child table "atacc3" has different type for column "test2" +alter table atacc3 drop test2; +-- succeed +alter table atacc3 add test2 int; +update atacc3 set test2 = 4 where test2 is null; +alter table atacc3 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +-- fail due to duplicates and circular inheritance +alter table atacc3 inherit atacc2; +ERROR: relation "atacc2" would be inherited from more than once +alter table atacc2 inherit atacc3; +ERROR: circular inheritance not allowed +DETAIL: "atacc3" is already a child of "atacc2". +alter table atacc2 inherit atacc2; +ERROR: circular inheritance not allowed +DETAIL: "atacc2" is already a child of "atacc2". 
+-- test that we really are a child now (should see 4 not 3 and cascade should go through) +select test2 from atacc2; + test2 +------- + 4 +(1 row) + +drop table atacc2 cascade; +NOTICE: drop cascades to table atacc3 +drop table atacc1; +-- adding only to a parent is allowed as of 9.2 +create table atacc1 (test int); +create table atacc2 (test2 int) inherits (atacc1); +-- ok: +alter table atacc1 add constraint foo check (test>0) no inherit; +-- check constraint is not there on child +insert into atacc2 (test) values (-3); +-- check constraint is there on parent +insert into atacc1 (test) values (-3); +ERROR: new row for relation "atacc1" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc1 (test) values (3); +-- fail, violating row: +alter table atacc2 add constraint foo check (test>0) no inherit; +ERROR: check constraint "foo" of relation "atacc2" is violated by some row +drop table atacc2; +drop table atacc1; +-- test unique constraint adding +create table atacc1 ( test int ) ; +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- try to create duplicates via alter table using - should fail +alter table atacc1 alter column test type integer using 0; +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(0) is duplicated. 
+drop table atacc1; +-- let's do one where the unique constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the unique constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test1); +ERROR: column "test1" named in key does not exist +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test, test2); +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, unique(test)); +alter table atacc1 add unique (test2); +-- should fail for @@ second one @@ +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_test_key" +DETAIL: Key (test)=(3) already exists. 
+drop table atacc1; +-- test primary key constraint adding +create table atacc1 ( id serial, test int) ; +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- inserting NULL should fail +insert into atacc1 (test) values(NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (4, null). +-- try adding a second primary key (should fail) +alter table atacc1 add constraint atacc_oid1 primary key(id); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- drop first primary key constraint +alter table atacc1 drop constraint atacc_test1 restrict; +-- try adding a primary key on oid (should succeed) +alter table atacc1 add constraint atacc_oid1 primary key(id); +drop table atacc1; +-- let's do one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. 
+insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do another one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing row +insert into atacc1 (test) values (NULL); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: column "test" of relation "atacc1" contains null values +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the primary key constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a primary key constraint (fails) +alter table atacc1 add constraint atacc_test1 primary key (test1); +ERROR: column "test1" of relation "atacc1" does not exist +drop table atacc1; +-- adding a new column as primary key to a non-empty table. +-- should fail unless the column has a non-null default value. +create table atacc1 ( test int ); +insert into atacc1 (test) values (0); +-- add a primary key column without a default (fails). +alter table atacc1 add column test2 int primary key; +ERROR: column "test2" of relation "atacc1" contains null values +-- now add a primary key column with a default (succeeds). +alter table atacc1 add column test2 int default 0 primary key; +drop table atacc1; +-- this combination used to have order-of-execution problems (bug #15580) +create table atacc1 (a int); +insert into atacc1 values(1); +alter table atacc1 + add column b float8 not null default random(), + add primary key(a); +drop table atacc1; +-- additionally, we've seen issues with foreign key validation not being +-- properly delayed until after a table rewrite. Check that works ok. 
+create table atacc1 (a int primary key); +alter table atacc1 add constraint atacc1_fkey foreign key (a) references atacc1 (a) not valid; +alter table atacc1 validate constraint atacc1_fkey, alter a type bigint; +drop table atacc1; +-- we've also seen issues with check constraints being validated at the wrong +-- time when there's a pending table rewrite. +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,1); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +drop table atacc1; +-- same as above, but ensure the constraint violation is detected +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,2); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +ERROR: check constraint "atacc1_chk" of relation "atacc1" is violated by some row +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test, test2); +-- try adding a second primary key - should fail +alter table atacc1 add constraint atacc_test2 primary key (test); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +insert into atacc1 (test,test2) values (NULL,3); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 3). +insert into atacc1 (test,test2) values (3, NULL); +ERROR: null value in column "test2" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (3, null). 
+insert into atacc1 (test,test2) values (NULL,NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, null). +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, primary key(test)); +-- only first should succeed +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_pkey" +DETAIL: Key (test)=(3) already exists. +insert into atacc1 (test2, test) values (1, NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 1). +drop table atacc1; +-- alter table / alter column [set/drop] not null tests +-- try altering system catalogs, should fail +alter table pg_class alter column relname drop not null; +ERROR: permission denied: "pg_class" is a system catalog +alter table pg_class alter relname set not null; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table non_existent alter column bar set not null; +ERROR: relation "non_existent" does not exist +alter table non_existent alter column bar drop not null; +ERROR: relation "non_existent" does not exist +-- test setting columns to null and not null and vice versa +-- test checking for null values and primary key +create table atacc1 (test int not null); +alter table atacc1 add constraint "atacc1_pkey" primary key (test); +alter table atacc1 alter column test drop not null; +ERROR: column "test" is in a primary key +alter table atacc1 drop constraint "atacc1_pkey"; +alter table atacc1 alter column test drop not null; +insert into atacc1 values (null); +alter table atacc1 alter test set not null; +ERROR: column 
"test" of relation "atacc1" contains null values +delete from atacc1; +alter table atacc1 alter test set not null; +-- try altering a non-existent column, should fail +alter table atacc1 alter bar set not null; +ERROR: column "bar" of relation "atacc1" does not exist +alter table atacc1 alter bar drop not null; +ERROR: column "bar" of relation "atacc1" does not exist +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +alter table myview alter column test drop not null; +ERROR: "myview" is not a table or foreign table +alter table myview alter column test set not null; +ERROR: "myview" is not a table or foreign table +drop view myview; +drop table atacc1; +-- set not null verified by constraints +create table atacc1 (test_a int, test_b int); +insert into atacc1 values (null, 1); +-- constraint not cover all values, should fail +alter table atacc1 add constraint atacc1_constr_or check(test_a is not null or test_b < 10); +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" contains null values +alter table atacc1 drop constraint atacc1_constr_or; +-- not valid constraint, should fail +alter table atacc1 add constraint atacc1_constr_invalid check(test_a is not null) not valid; +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" contains null values +alter table atacc1 drop constraint atacc1_constr_invalid; +-- with valid constraint +update atacc1 set test_a = 1; +alter table atacc1 add constraint atacc1_constr_a_valid check(test_a is not null); +alter table atacc1 alter test_a set not null; +delete from atacc1; +insert into atacc1 values (2, null); +alter table atacc1 alter test_a drop not null; +-- test multiple set not null at same time +-- test_a checked by atacc1_constr_a_valid, test_b should fail by table scan +alter table atacc1 alter test_a set not null, alter test_b set not null; +ERROR: column "test_b" of relation "atacc1" contains 
null values +-- commands order has no importance +alter table atacc1 alter test_b set not null, alter test_a set not null; +ERROR: column "test_b" of relation "atacc1" contains null values +-- valid one by table scan, one by check constraints +update atacc1 set test_b = 1; +alter table atacc1 alter test_b set not null, alter test_a set not null; +alter table atacc1 alter test_a drop not null, alter test_b drop not null; +-- both column has check constraints +alter table atacc1 add constraint atacc1_constr_b_valid check(test_b is not null); +alter table atacc1 alter test_b set not null, alter test_a set not null; +drop table atacc1; +-- test inheritance +create table parent (a int); +create table child (b varchar(255)) inherits (parent); +alter table parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +alter table parent alter a drop not null; +insert into parent values (NULL); +insert into child (a, b) values (NULL, 'foo'); +alter table only parent alter a set not null; +ERROR: column "a" of relation "parent" contains null values +alter table child alter a set not null; +ERROR: column "a" of relation "child" contains null values +delete from parent; +alter table only parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). 
+delete from child; +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +drop table child; +drop table parent; +-- test setting and removing default values +create table def_test ( + c1 int4 default 5, + c2 text default 'initial_default' +); +insert into def_test default values; +alter table def_test alter column c1 drop default; +insert into def_test default values; +alter table def_test alter column c2 drop default; +insert into def_test default values; +alter table def_test alter column c1 set default 10; +alter table def_test alter column c2 set default 'new_default'; +insert into def_test default values; +select * from def_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default +(4 rows) + +-- set defaults to an incorrect type: this should fail +alter table def_test alter column c1 set default 'wrong_datatype'; +ERROR: invalid input syntax for type integer: "wrong_datatype" +alter table def_test alter column c2 set default 20; +-- set defaults on a non-existent column: this should fail +alter table def_test alter column c3 set default 30; +ERROR: column "c3" of relation "def_test" does not exist +-- set defaults on views: we need to create a view, add a rule +-- to allow insertions into it, and then alter the view to add +-- a default +create view def_view_test as select * from def_test; +create rule def_view_test_ins as + on insert to def_view_test + do instead insert into def_test select new.*; +insert into def_view_test default values; +alter table def_view_test alter column c1 set default 45; +insert into def_view_test default values; +alter table def_view_test alter column c2 set default 'view_default'; +insert into def_view_test default values; +select * from def_view_test; + c1 | c2 +----+----------------- + 5 | initial_default + | 
initial_default + | + 10 | new_default + | + 45 | + 45 | view_default +(7 rows) + +drop rule def_view_test_ins on def_view_test; +drop view def_view_test; +drop table def_test; +-- alter table / drop column tests +-- try altering system catalogs, should fail +alter table pg_class drop column relname; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table nosuchtable drop column bar; +ERROR: relation "nosuchtable" does not exist +-- test dropping columns +create table atacc1 (a int4 not null, b int4, c int4 not null, d int4); +insert into atacc1 values (1, 2, 3, 4); +alter table atacc1 drop a; +alter table atacc1 drop a; +ERROR: column "a" of relation "atacc1" does not exist +-- SELECTs +select * from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select * from atacc1 order by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 order by a; + ^ +select * from atacc1 order by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 order by "........pg.dropped.1........"... + ^ +select * from atacc1 group by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 group by a; + ^ +select * from atacc1 group by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 group by "........pg.dropped.1........"... 
+ ^ +select atacc1.* from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a from atacc1; +ERROR: column "a" does not exist +LINE 1: select a from atacc1; + ^ +select atacc1.a from atacc1; +ERROR: column atacc1.a does not exist +LINE 1: select atacc1.a from atacc1; + ^ +select b,c,d from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a,b,c,d from atacc1; +ERROR: column "a" does not exist +LINE 1: select a,b,c,d from atacc1; + ^ +select * from atacc1 where a = 1; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 where a = 1; + ^ +select "........pg.dropped.1........" from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........" from atacc1; + ^ +select atacc1."........pg.dropped.1........" from atacc1; +ERROR: column atacc1.........pg.dropped.1........ does not exist +LINE 1: select atacc1."........pg.dropped.1........" from atacc1; + ^ +select "........pg.dropped.1........",b,c,d from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........",b,c,d from atacc1; + ^ +select * from atacc1 where "........pg.dropped.1........" = 1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 where "........pg.dropped.1........" = ... + ^ +-- UPDATEs +update atacc1 set a = 3; +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: update atacc1 set a = 3; + ^ +update atacc1 set b = 2 where a = 3; +ERROR: column "a" does not exist +LINE 1: update atacc1 set b = 2 where a = 3; + ^ +update atacc1 set "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: update atacc1 set "........pg.dropped.1........" = 3; + ^ +update atacc1 set b = 2 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" 
does not exist +LINE 1: update atacc1 set b = 2 where "........pg.dropped.1........"... + ^ +-- INSERTs +insert into atacc1 values (10, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (10, 11, 12, 13); + ^ +insert into atacc1 values (default, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (default, 11, 12, 13); + ^ +insert into atacc1 values (11, 12, 13); +insert into atacc1 (a) values (10); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (10); + ^ +insert into atacc1 (a) values (default); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (default); + ^ +insert into atacc1 (a,b,c,d) values (10,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (10,11,12,13); + ^ +insert into atacc1 (a,b,c,d) values (default,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (default,11,12,13); + ^ +insert into atacc1 (b,c,d) values (11,12,13); +insert into atacc1 ("........pg.dropped.1........") values (10); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........") values (default); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (10,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (default,11,12,13); +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +-- DELETEs +delete from atacc1 where a = 3; +ERROR: column "a" does not exist +LINE 1: delete from atacc1 where a = 3; + ^ +delete from atacc1 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: delete from atacc1 where "........pg.dropped.1........" = 3; + ^ +delete from atacc1; +-- try dropping a non-existent column, should fail +alter table atacc1 drop bar; +ERROR: column "bar" of relation "atacc1" does not exist +-- try removing an oid column, should succeed (as it's nonexistent) +alter table atacc1 SET WITHOUT OIDS; +-- try adding an oid column, should fail (not supported) +alter table atacc1 SET WITH OIDS; +ERROR: syntax error at or near "WITH" +LINE 1: alter table atacc1 SET WITH OIDS; + ^ +-- try dropping the xmin column, should fail +alter table atacc1 drop xmin; +ERROR: cannot drop system column "xmin" +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +select * from myview; + b | c | d +---+---+--- +(0 rows) + +alter table myview drop d; +ERROR: "myview" is not a table, composite type, or foreign table +drop view myview; +-- test some commands to make sure they fail on the dropped column +analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +vacuum analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +vacuum analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +comment on column atacc1.a is 'testing'; +ERROR: column "a" of relation "atacc1" does not exist +comment on column atacc1."........pg.dropped.1........" is 'testing'; +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +alter table atacc1 alter a set storage plain; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set storage plain; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set statistics 0; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set statistics 0; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set default 3; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set default 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop default; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop default; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 rename a to x; +ERROR: column "a" does not exist +alter table atacc1 rename "........pg.dropped.1........" to x; +ERROR: column "........pg.dropped.1........" does not exist +alter table atacc1 add primary key(a); +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 add primary key("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +alter table atacc1 add unique(a); +ERROR: column "a" named in key does not exist +alter table atacc1 add unique("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" named in key does not exist +alter table atacc1 add check (a > 3); +ERROR: column "a" does not exist +alter table atacc1 add check ("........pg.dropped.1........" > 3); +ERROR: column "........pg.dropped.1........" does not exist +create table atacc2 (id int4 unique); +alter table atacc1 add foreign key (a) references atacc2(id); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc1 add foreign key ("........pg.dropped.1........") references atacc2(id); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1(a); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +drop table atacc2; +create index "testing_idx" on atacc1(a); +ERROR: column "a" does not exist +create index "testing_idx" on atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" 
does not exist +-- test create as and select into +insert into atacc1 values (21, 22, 23); +create table attest1 as select * from atacc1; +select * from attest1; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest1; +select * into attest2 from atacc1; +select * from attest2; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest2; +-- try dropping all columns +alter table atacc1 drop c; +alter table atacc1 drop d; +alter table atacc1 drop b; +select * from atacc1; +-- +(1 row) + +drop table atacc1; +-- test constraint error reporting in presence of dropped columns +create table atacc1 (id serial primary key, value int check (value < 10)); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (1, 100). +alter table atacc1 drop column value; +alter table atacc1 add column value int check (value < 10); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (2, 100). +insert into atacc1(id, value) values (null, 0); +ERROR: null value in column "id" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 0). 
+drop table atacc1; +-- test inheritance +create table parent (a int, b int, c int); +insert into parent values (1, 2, 3); +alter table parent drop a; +create table child (d varchar(255)) inherits (parent); +insert into child values (12, 13, 'testing'); +select * from parent; + b | c +----+---- + 2 | 3 + 12 | 13 +(2 rows) + +select * from child; + b | c | d +----+----+--------- + 12 | 13 | testing +(1 row) + +alter table parent drop c; +select * from parent; + b +---- + 2 + 12 +(2 rows) + +select * from child; + b | d +----+--------- + 12 | testing +(1 row) + +drop table child; +drop table parent; +-- check error cases for inheritance column merging +create table parent (a float8, b numeric(10,4), c text collate "C"); +create table child (a float4) inherits (parent); -- fail +NOTICE: merging column "a" with inherited definition +ERROR: column "a" has a type conflict +DETAIL: double precision versus real +create table child (b decimal(10,7)) inherits (parent); -- fail +NOTICE: moving and merging column "b" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "b" has a type conflict +DETAIL: numeric(10,4) versus numeric(10,7) +create table child (c text collate "POSIX") inherits (parent); -- fail +NOTICE: moving and merging column "c" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. 
+ERROR: column "c" has a collation conflict +DETAIL: "C" versus "POSIX" +create table child (a double precision, b decimal(10,4)) inherits (parent); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +drop table child; +drop table parent; +-- test copy in/out +create table attest (a int4, b int4, c int4); +insert into attest values (1,2,3); +alter table attest drop a; +copy attest to stdout; +2 3 +copy attest(a) to stdout; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") to stdout; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest from stdin; +ERROR: extra data after last expected column +CONTEXT: COPY attest, line 1: "10 11 12" +select * from attest; + b | c +---+--- + 2 | 3 +(1 row) + +copy attest from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 +(2 rows) + +copy attest(a) from stdin; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") from stdin; +ERROR: column "........pg.dropped.1........" 
of relation "attest" does not exist +copy attest(b,c) from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 + 31 | 32 +(3 rows) + +drop table attest; +-- test inheritance +create table dropColumn (a int, b int, e int); +create table dropColumnChild (c int) inherits (dropColumn); +create table dropColumnAnother (d int) inherits (dropColumnChild); +-- these two should fail +alter table dropColumnchild drop column a; +ERROR: cannot drop inherited column "a" +alter table only dropColumnChild drop column b; +ERROR: cannot drop inherited column "b" +-- these three should work +alter table only dropColumn drop column e; +alter table dropColumnChild drop column c; +alter table dropColumn drop column a; +create table renameColumn (a int); +create table renameColumnChild (b int) inherits (renameColumn); +create table renameColumnAnother (c int) inherits (renameColumnChild); +-- these three should fail +alter table renameColumnChild rename column a to d; +ERROR: cannot rename inherited column "a" +alter table only renameColumnChild rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +alter table only renameColumn rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +-- these should work +alter table renameColumn rename column a to d; +alter table renameColumnChild rename column b to a; +-- these should work +alter table if exists doesnt_exist_tab rename column a to d; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +alter table if exists doesnt_exist_tab rename column b to a; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +-- this should work +alter table renameColumn add column w int; +-- this should fail +alter table only renameColumn add column x int; +ERROR: column must be added to child tables too +-- Test corner cases in dropping of inherited columns +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column 
"f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +-- should work +alter table p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +select f1 from c1; + f1 +---- +(0 rows) + +alter table c1 drop column f1; +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table p1 drop column f1; +-- c1.f1 is dropped now, since there is no local definition for it +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is NOT dropped, but must now be considered non-inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1(id int, name text); +create table p2(id2 int, name text, height int); +create table c1(age int) inherits(p1,p2); +NOTICE: merging 
multiple inherited definitions of column "name" +create table gc1() inherits (c1); +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | name | 2 | f + c1 | id2 | 1 | f + c1 | height | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | name | 1 | f + gc1 | id2 | 1 | f + gc1 | height | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p1 | name | 0 | t + p2 | id2 | 0 | t + p2 | name | 0 | t + p2 | height | 0 | t +(15 rows) + +-- should work +alter table only p1 drop column name; +-- should work. Now c1.name is local and inhcount is 0. +alter table p2 drop column name; +-- should be rejected since its inherited +alter table gc1 drop column name; +ERROR: cannot drop inherited column "name" +-- should work, and drop gc1.name along +alter table c1 drop column name; +-- should fail: column does not exist +alter table gc1 drop column name; +ERROR: column "name" of relation "gc1" does not exist +-- should work and drop the attribute in all tables +alter table p2 drop column height; +-- IF EXISTS test +create table dropColumnExists (); +alter table dropColumnExists drop column non_existing; --fail +ERROR: column "non_existing" of relation "dropcolumnexists" does not exist +alter table dropColumnExists drop column if exists non_existing; --succeed +NOTICE: column "non_existing" of relation "dropcolumnexists" does not exist, skipping +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | id2 | 1 | f + 
c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | id2 | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p2 | id2 | 0 | t +(8 rows) + +drop table p1, p2 cascade; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table c1 +drop cascades to table gc1 +-- test attinhcount tracking with merged columns +create table depth0(); +create table depth1(c text) inherits (depth0); +create table depth2() inherits (depth1); +alter table depth0 add c text; +NOTICE: merging definition of column "c" for child "depth1" +select attrelid::regclass, attname, attinhcount, attislocal +from pg_attribute +where attnum > 0 and attrelid::regclass in ('depth0', 'depth1', 'depth2') +order by attrelid::regclass::text, attnum; + attrelid | attname | attinhcount | attislocal +----------+---------+-------------+------------ + depth0 | c | 0 | t + depth1 | c | 1 | t + depth2 | c | 1 | f +(3 rows) + +-- test renumbering of child-table columns in inherited operations +create table p1 (f1 int); +create table c1 (f2 text, f3 int) inherits (p1); +alter table p1 add column a1 int check (a1 > 0); +alter table p1 add column f2 text; +NOTICE: merging definition of column "f2" for child "c1" +insert into p1 values (1,2,'abc'); +insert into c1 values(11,'xyz',33,0); -- should fail +ERROR: new row for relation "c1" violates check constraint "p1_a1_check" +DETAIL: Failing row contains (11, xyz, 33, 0). 
+insert into c1 values(11,'xyz',33,22); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 2 | abc + 11 | 22 | xyz +(2 rows) + +update p1 set a1 = a1 + 1, f2 = upper(f2); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 3 | ABC + 11 | 23 | XYZ +(2 rows) + +drop table p1 cascade; +NOTICE: drop cascades to table c1 +-- test that operations with a dropped column do not try to reference +-- its datatype +create domain mytype as text; +create temp table foo (f1 text, f2 mytype, f3 text); +insert into foo values('bb','cc','dd'); +select * from foo; + f1 | f2 | f3 +----+----+---- + bb | cc | dd +(1 row) + +drop domain mytype cascade; +NOTICE: drop cascades to column f2 of table foo +select * from foo; + f1 | f3 +----+---- + bb | dd +(1 row) + +insert into foo values('qq','rr'); +select * from foo; + f1 | f3 +----+---- + bb | dd + qq | rr +(2 rows) + +update foo set f3 = 'zz'; +select * from foo; + f1 | f3 +----+---- + bb | zz + qq | zz +(2 rows) + +select f3,max(f1) from foo group by f3; + f3 | max +----+----- + zz | qq +(1 row) + +-- Simple tests for alter table column type +alter table foo alter f1 TYPE integer; -- fails +ERROR: column "f1" cannot be cast automatically to type integer +HINT: You might need to specify "USING f1::integer". +alter table foo alter f1 TYPE varchar(10); +create table anothertab (atcol1 serial8, atcol2 boolean, + constraint anothertab_chk check (atcol1 <= 3)); +insert into anothertab (atcol1, atcol2) values (default, true); +insert into anothertab (atcol1, atcol2) values (default, false); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +alter table anothertab alter column atcol1 type boolean; -- fails +ERROR: column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to specify "USING atcol1::boolean". 
+alter table anothertab alter column atcol1 type boolean using atcol1::int; -- fails +ERROR: result of USING clause for column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to add an explicit cast. +alter table anothertab alter column atcol1 type integer; +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +insert into anothertab (atcol1, atcol2) values (45, null); -- fails +ERROR: new row for relation "anothertab" violates check constraint "anothertab_chk" +DETAIL: Failing row contains (45, null). +insert into anothertab (atcol1, atcol2) values (default, null); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f + 3 | +(3 rows) + +alter table anothertab alter column atcol2 type text + using case when atcol2 is true then 'IT WAS TRUE' + when atcol2 is false then 'IT WAS FALSE' + else 'IT WAS NULL!' end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + 1 | IT WAS TRUE + 2 | IT WAS FALSE + 3 | IT WAS NULL! +(3 rows) + +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: default for column "atcol1" cannot be cast automatically to type boolean +alter table anothertab alter column atcol1 drop default; +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: operator does not exist: boolean <= integer +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+alter table anothertab drop constraint anothertab_chk; +alter table anothertab drop constraint anothertab_chk; -- fails +ERROR: constraint "anothertab_chk" of relation "anothertab" does not exist +alter table anothertab drop constraint IF EXISTS anothertab_chk; -- succeeds +NOTICE: constraint "anothertab_chk" of relation "anothertab" does not exist, skipping +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + f | IT WAS TRUE + t | IT WAS FALSE + f | IT WAS NULL! +(3 rows) + +drop table anothertab; +-- Test index handling in alter table column type (cf. bugs #15835, #15865) +create table anothertab(f1 int primary key, f2 int unique, + f3 int, f4 int, f5 int); +alter table anothertab + add exclude using btree (f3 with =); +alter table anothertab + add exclude using btree (f4 with =) where (f4 is not null); +alter table anothertab + add exclude using btree (f4 with =) where (f5 > 0); +alter table anothertab + add unique(f1,f4); +create index on anothertab(f2,f3); +create unique index on anothertab(f4); +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + f1 | integer | | not null | + f2 | integer | | | + f3 | integer | | | + f4 | integer | | | + f5 | integer | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +alter table anothertab alter column f1 type bigint; +alter table anothertab + alter column f2 type bigint, + alter column f3 type 
bigint, + alter column f4 type bigint; +alter table anothertab alter column f5 type bigint; +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | + f2 | bigint | | | + f3 | bigint | | | + f4 | bigint | | | + f5 | bigint | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +drop table anothertab; +-- test that USING expressions are parsed before column alter type / drop steps +create table another (f1 int, f2 text, f3 text); +insert into another values(1, 'one', 'uno'); +insert into another values(2, 'two', 'due'); +insert into another values(3, 'three', 'tre'); +select * from another; + f1 | f2 | f3 +----+-------+----- + 1 | one | uno + 2 | two | due + 3 | three | tre +(3 rows) + +alter table another + alter f1 type text using f2 || ' and ' || f3 || ' more', + alter f2 type bigint using f1 * 10, + drop column f3; +select * from another; + f1 | f2 +--------------------+---- + one and uno more | 10 + two and due more | 20 + three and tre more | 30 +(3 rows) + +drop table another; +-- Create an index that skips WAL, then perform a SET DATA TYPE that skips +-- rewriting the index. 
+begin; +create table skip_wal_skip_rewrite_index (c varchar(10) primary key); +alter table skip_wal_skip_rewrite_index alter c type varchar(20); +commit; +-- table's row type +create table tab1 (a int, b text); +create table tab2 (x int, y tab1); +alter table tab1 alter column b type varchar; -- fails +ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES 
FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +drop table at_partitioned; +-- Alter column type when no table rewrite is required +-- Also check that comments are preserved +create table at_partitioned(id int, name varchar(64), unique (id, name)) + partition by hash(id); +comment on constraint at_partitioned_id_name_key on at_partitioned is 'parent constraint'; +comment on index at_partitioned_id_name_key is 'parent index'; +create table at_partitioned_0 partition of at_partitioned + for values with (modulus 2, remainder 0); +comment on constraint at_partitioned_0_id_name_key on at_partitioned_0 is 'child 0 constraint'; +comment on index at_partitioned_0_id_name_key is 'child 0 index'; +create table at_partitioned_1 partition of at_partitioned + for values with (modulus 2, remainder 1); +comment on constraint at_partitioned_1_id_name_key on at_partitioned_1 is 'child 1 constraint'; +comment on index at_partitioned_1_id_name_key is 'child 1 index'; +insert into at_partitioned values(1, 'foo'); +insert into at_partitioned values(3, 'bar'); +create temp table old_oids as + select relname, oid as oldoid, relfilenode as oldfilenode + from pg_class where relname like 'at_partitioned%'; +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc 
+------------------------------+----------+---------+--------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | t | own | child 0 index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_id_name_key | t | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+-------------------- + at_partitioned_0_id_name_key | child 0 constraint + at_partitioned_1_id_name_key | child 1 constraint + at_partitioned_id_name_key | parent constraint +(3 rows) + +alter table at_partitioned alter column name type varchar(127); +-- Note: these tests currently show the wrong behavior for comments :-( +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+-------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | f | own | parent index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_id_name_key | f | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+------------------- + at_partitioned_0_id_name_key | + at_partitioned_1_id_name_key | + at_partitioned_id_name_key | parent constraint +(3 rows) + +-- Don't remove this DROP, it exposes bug #15672 +drop table at_partitioned; +-- disallow 
recursive containment of row types +create temp table recur1 (f1 int); +alter table recur1 add column f2 recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 recur1[]; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create domain array_of_recur1 as recur1[]; +alter table recur1 add column f2 array_of_recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create temp table recur2 (f1 int, f2 recur1); +alter table recur1 add column f2 recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 int; +alter table recur1 alter column f2 type recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +-- SET STORAGE may need to add a TOAST table +create table test_storage (a text); +alter table test_storage alter a set storage plain; +alter table test_storage add b int default 0; -- rewrite table to remove its TOAST table +alter table test_storage alter a set storage extended; -- re-add TOAST table +select reltoastrelid <> 0 as has_toast_table +from pg_class +where oid = 'test_storage'::regclass; + has_toast_table +----------------- + t +(1 row) + +-- test that SET STORAGE propagates to index correctly +create index test_storage_idx on test_storage (b, a); +alter table test_storage alter column a set storage external; +\d+ test_storage + Table "public.test_storage" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | external | | + b | integer | | | 0 | plain | | +Indexes: + "test_storage_idx" btree (b, a) + +\d+ test_storage_idx + Index "public.test_storage_idx" + Column | Type | Key? 
| Definition | Storage | Stats target +--------+---------+------+------------+----------+-------------- + b | integer | yes | b | plain | + a | text | yes | a | external | +btree, for table "public.test_storage" + +-- ALTER COLUMN TYPE with a check constraint and a child table (bug #13779) +CREATE TABLE test_inh_check (a float check (a > 10.2), b float); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +-- also try noinherit, local, and local+inherited cases +ALTER TABLE test_inh_check ADD CONSTRAINT bnoinherit CHECK (b > 100) NO INHERIT; +ALTER TABLE test_inh_check_child ADD CONSTRAINT blocal CHECK (b < 1000); +ALTER TABLE test_inh_check_child ADD CONSTRAINT bmerged CHECK (b > 1); +ALTER TABLE test_inh_check ADD CONSTRAINT bmerged CHECK (b > 1); +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "bmerged" CHECK (b > 1::double precision) + "bnoinherit" CHECK (b > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "blocal" CHECK (b < 1000::double precision) + "bmerged" CHECK (b > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +ALTER TABLE test_inh_check ALTER COLUMN b TYPE numeric; +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "bmerged" CHECK (b::double precision > 1::double precision) + "bnoinherit" CHECK (b::double precision > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "blocal" CHECK (b::double precision < 1000::double precision) + "bmerged" CHECK (b::double precision > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +-- ALTER COLUMN TYPE with different schema in children +-- Bug at https://postgr.es/m/20170102225618.GA10071@telsasoft.com +CREATE TABLE test_type_diff (f1 int); +CREATE TABLE test_type_diff_c (extra smallint) INHERITS (test_type_diff); +ALTER TABLE test_type_diff ADD COLUMN f2 int; +INSERT INTO test_type_diff_c VALUES (1, 2, 3); +ALTER TABLE test_type_diff ALTER COLUMN f2 TYPE bigint USING f2::bigint; +CREATE TABLE test_type_diff2 (int_two int2, int_four int4, int_eight int8); +CREATE TABLE test_type_diff2_c1 (int_four int4, int_eight int8, int_two int2); +CREATE TABLE test_type_diff2_c2 (int_eight int8, int_two int2, int_four int4); +CREATE TABLE test_type_diff2_c3 (int_two int2, int_four int4, int_eight int8); +ALTER TABLE test_type_diff2_c1 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c2 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c3 INHERIT test_type_diff2; +INSERT INTO 
test_type_diff2_c1 VALUES (1, 2, 3); +INSERT INTO test_type_diff2_c2 VALUES (4, 5, 6); +INSERT INTO test_type_diff2_c3 VALUES (7, 8, 9); +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int8 USING int_four::int8; +-- whole-row references are disallowed +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int4 USING (pg_column_size(test_type_diff2)); +ERROR: cannot convert whole-row table reference +DETAIL: USING expression contains a whole-row table reference. +-- check for rollback of ANALYZE corrupting table property flags (bug #11638) +CREATE TABLE check_fk_presence_1 (id int PRIMARY KEY, t text); +CREATE TABLE check_fk_presence_2 (id int REFERENCES check_fk_presence_1, t text); +BEGIN; +ALTER TABLE check_fk_presence_2 DROP CONSTRAINT check_fk_presence_2_id_fkey; +ANALYZE check_fk_presence_2; +ROLLBACK; +\d check_fk_presence_2 + Table "public.check_fk_presence_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + t | text | | | +Foreign-key constraints: + "check_fk_presence_2_id_fkey" FOREIGN KEY (id) REFERENCES check_fk_presence_1(id) + +DROP TABLE check_fk_presence_1, check_fk_presence_2; +-- check column addition within a view (bug #14876) +create table at_base_table(id int, stuff text); +insert into at_base_table values (23, 'skidoo'); +create view at_view_1 as select * from at_base_table bt; +create view at_view_2 as select *, to_json(v1) as j from at_view_1 v1; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | +View definition: + SELECT bt.id, + bt.stuff + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + 
id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo"} +(1 row) + +create or replace view at_view_1 as select *, 2+2 as more from at_base_table bt; +\d+ at_view_1 + View "public.at_view_1" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + more | integer | | | | plain | +View definition: + SELECT bt.id, + bt.stuff, + 2 + 2 AS more + FROM at_base_table bt; + +\d+ at_view_2 + View "public.at_view_2" + Column | Type | Collation | Nullable | Default | Storage | Description +--------+---------+-----------+----------+---------+----------+------------- + id | integer | | | | plain | + stuff | text | | | | extended | + j | json | | | | extended | +View definition: + SELECT v1.id, + v1.stuff, + to_json(v1.*) AS j + FROM at_view_1 v1; + +explain (verbose, costs off) select * from at_view_2; + QUERY PLAN +---------------------------------------------------------------- + Seq Scan on public.at_base_table bt + Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL)) +(2 rows) + +select * from at_view_2; + id | stuff | j +----+--------+---------------------------------------- + 23 | skidoo | {"id":23,"stuff":"skidoo","more":null} +(1 row) + +drop view at_view_2; +drop view at_view_1; +drop table at_base_table; +-- check adding a column not iself requiring a rewrite, together with +-- a column requiring a default (bug #16038) 
+-- ensure that rewrites aren't silently optimized away, removing the +-- value of the test +CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) +RETURNS boolean +LANGUAGE plpgsql AS $$ +DECLARE + v_relfilenode oid; +BEGIN + v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; + + EXECUTE p_ddl; + + RETURN v_relfilenode <> (SELECT relfilenode FROM pg_class WHERE oid = p_tablename); +END; +$$; +CREATE TABLE rewrite_test(col text); +INSERT INTO rewrite_test VALUES ('something'); +INSERT INTO rewrite_test VALUES (NULL); +-- empty[12] don't need rewrite, but notempty[12]_rewrite will force one +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty1 text, + ADD COLUMN notempty1_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty2_rewrite serial, + ADD COLUMN empty2 text; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +-- also check that fast defaults cause no problem, first without rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty3 text, + ADD COLUMN notempty3_norewrite int default 42; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty4_norewrite int default 42, + ADD COLUMN empty4 text; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +-- then with rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty5 text, + ADD COLUMN notempty5_norewrite int default 42, + ADD COLUMN notempty5_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty6_rewrite serial, + ADD COLUMN empty6 text, + ADD COLUMN notempty6_norewrite int default 42; +$$); + check_ddl_rewrite 
+------------------- + t +(1 row) + +-- cleanup +DROP FUNCTION check_ddl_rewrite(regclass, text); +DROP TABLE rewrite_test; +-- +-- lock levels +-- +drop type lockmodes; +ERROR: type "lockmodes" does not exist +create type lockmodes as enum ( + 'SIReadLock' +,'AccessShareLock' +,'RowShareLock' +,'RowExclusiveLock' +,'ShareUpdateExclusiveLock' +,'ShareLock' +,'ShareRowExclusiveLock' +,'ExclusiveLock' +,'AccessExclusiveLock' +); +drop view my_locks; +ERROR: view "my_locks" does not exist +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname != 'my_locks' +group by c.relname; +create table alterlock (f1 int primary key, f2 text); +insert into alterlock values (1, 'foo'); +create table alterlock2 (f3 int primary key, f1 int); +insert into alterlock2 values (1, 1); +begin; alter table alterlock alter column f2 set statistics 150; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock cluster on alterlock_pkey; +select * from my_locks order by 1; + relname | max_lockmode +----------------+-------------------------- + alterlock | ShareUpdateExclusiveLock + alterlock_pkey | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set without cluster; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (fillfactor = 100); +select * from my_locks order by 1; + relname | max_lockmode 
+-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock reset (fillfactor); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (toast.autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set (n_distinct = 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +-- test that mixing options with different lock levels works as expected +begin; alter table alterlock set (autovacuum_enabled = off, fillfactor = 80); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set storage extended; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock alter column f2 set default 'x'; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; +create trigger ttdummy + before delete or update on 
alterlock + for each row + execute procedure + ttdummy (1, 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+----------------------- + alterlock | ShareRowExclusiveLock +(1 row) + +rollback; +begin; +select * from my_locks order by 1; + relname | max_lockmode +---------+-------------- +(0 rows) + +alter table alterlock2 add foreign key (f1) references alterlock (f1); +select * from my_locks order by 1; + relname | max_lockmode +-----------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +begin; +alter table alterlock2 +add constraint alterlock2nv foreign key (f1) references alterlock (f1) NOT VALID; +select * from my_locks order by 1; + relname | max_lockmode +------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock +(2 rows) + +commit; +begin; +alter table alterlock2 validate constraint alterlock2nv; +select * from my_locks order by 1; + relname | max_lockmode +-----------------+-------------------------- + alterlock | RowShareLock + alterlock2 | ShareUpdateExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname = 'my_locks' +group by c.relname; +-- raise exception +alter table my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter view my_locks set (autovacuum_enabled = false); +ERROR: 
unrecognized parameter "autovacuum_enabled" +alter table my_locks reset (autovacuum_enabled); +alter view my_locks reset (autovacuum_enabled); +begin; +alter view my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter view my_locks reset (security_barrier); +rollback; +-- this test intentionally applies the ALTER TABLE command against a view, but +-- uses a view option so we expect this to succeed. This form of SQL is +-- accepted for historical reasons, as shown in the docs for ALTER VIEW +begin; +alter table my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter table my_locks reset (security_barrier); +rollback; +-- cleanup +drop table alterlock2; +drop table alterlock; +drop view my_locks; +drop type lockmodes; +-- +-- alter function +-- +create function test_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql returns null on null input; +select test_strict(NULL); + test_strict +------------- + +(1 row) + +alter function test_strict(text) called on null input; +select test_strict(NULL); + test_strict +------------------- + got passed a null +(1 row) + +create function non_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql called on null input; +select non_strict(NULL); + non_strict +------------------- + got passed a null +(1 row) + +alter function non_strict(text) returns null on null input; +select non_strict(NULL); + non_strict +------------ + +(1 row) + +-- +-- alter object set schema +-- +create schema alter1; +create schema alter2; +create table alter1.t1(f1 serial primary key, f2 int check (f2 > 0)); +create view alter1.v1 as select * from alter1.t1; +create function alter1.plus1(int) returns int as 'select $1+1' language 
sql; +create domain alter1.posint integer check (value > 0); +create type alter1.ctype as (f1 int, f2 text); +create function alter1.same(alter1.ctype, alter1.ctype) returns boolean language sql +as 'select $1.f1 is not distinct from $2.f1 and $1.f2 is not distinct from $2.f2'; +create operator alter1.=(procedure = alter1.same, leftarg = alter1.ctype, rightarg = alter1.ctype); +create operator class alter1.ctype_hash_ops default for type alter1.ctype using hash as + operator 1 alter1.=(alter1.ctype, alter1.ctype); +create conversion alter1.latin1_to_utf8 for 'latin1' to 'utf8' from iso8859_1_to_utf8; +create text search parser alter1.prs(start = prsd_start, gettoken = prsd_nexttoken, end = prsd_end, lextypes = prsd_lextype); +create text search configuration alter1.cfg(parser = alter1.prs); +create text search template alter1.tmpl(init = dsimple_init, lexize = dsimple_lexize); +create text search dictionary alter1.dict(template = alter1.tmpl); +insert into alter1.t1(f2) values(11); +insert into alter1.t1(f2) values(12); +alter table alter1.t1 set schema alter1; -- no-op, same schema +alter table alter1.t1 set schema alter2; +alter table alter1.v1 set schema alter2; +alter function alter1.plus1(int) set schema alter2; +alter domain alter1.posint set schema alter2; +alter operator class alter1.ctype_hash_ops using hash set schema alter2; +alter operator family alter1.ctype_hash_ops using hash set schema alter2; +alter operator alter1.=(alter1.ctype, alter1.ctype) set schema alter2; +alter function alter1.same(alter1.ctype, alter1.ctype) set schema alter2; +alter type alter1.ctype set schema alter1; -- no-op, same schema +alter type alter1.ctype set schema alter2; +alter conversion alter1.latin1_to_utf8 set schema alter2; +alter text search parser alter1.prs set schema alter2; +alter text search configuration alter1.cfg set schema alter2; +alter text search template alter1.tmpl set schema alter2; +alter text search dictionary alter1.dict set schema alter2; +-- this 
should succeed because nothing is left in alter1 +drop schema alter1; +insert into alter2.t1(f2) values(13); +insert into alter2.t1(f2) values(14); +select * from alter2.t1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select * from alter2.v1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select alter2.plus1(41); + plus1 +------- + 42 +(1 row) + +-- clean up +drop schema alter2 cascade; +NOTICE: drop cascades to 13 other objects +DETAIL: drop cascades to table alter2.t1 +drop cascades to view alter2.v1 +drop cascades to function alter2.plus1(integer) +drop cascades to type alter2.posint +drop cascades to type alter2.ctype +drop cascades to function alter2.same(alter2.ctype,alter2.ctype) +drop cascades to operator alter2.=(alter2.ctype,alter2.ctype) +drop cascades to operator family alter2.ctype_hash_ops for access method hash +drop cascades to conversion alter2.latin1_to_utf8 +drop cascades to text search parser alter2.prs +drop cascades to text search configuration alter2.cfg +drop cascades to text search template alter2.tmpl +drop cascades to text search dictionary alter2.dict +-- +-- composite types +-- +CREATE TYPE test_type AS (a int); +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE nosuchtype ADD ATTRIBUTE b text; -- fails +ERROR: relation "nosuchtype" does not exist +ALTER TYPE test_type ADD ATTRIBUTE b text; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +ALTER TYPE test_type ADD ATTRIBUTE b text; -- fails +ERROR: column "b" of relation "test_type" already exists +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE varchar; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default 
+--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE integer; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE b; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE c; -- fails +ERROR: column "c" of relation "test_type" does not exist +ALTER TYPE test_type DROP ATTRIBUTE IF EXISTS c; +NOTICE: column "c" of relation "test_type" does not exist, skipping +ALTER TYPE test_type DROP ATTRIBUTE a, ADD ATTRIBUTE d boolean; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + d | boolean | | | + +ALTER TYPE test_type RENAME ATTRIBUTE a TO aa; +ERROR: column "a" does not exist +ALTER TYPE test_type RENAME ATTRIBUTE d TO dd; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + dd | boolean | | | + +DROP TYPE test_type; +CREATE TYPE test_type1 AS (a int, b text); +CREATE TABLE test_tbl1 (x int, y test_type1); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type1" because column "test_tbl1.y" uses it +CREATE TYPE test_type2 AS (a int, b text); +CREATE TABLE test_tbl2 OF test_type2; +CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type 
| Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ADD ATTRIBUTE c text; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ADD ATTRIBUTE c text CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 DROP ATTRIBUTE b; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. 
+ALTER TYPE test_type2 DROP ATTRIBUTE b CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +\d test_tbl2_subclass + Table "public.test_tbl2_subclass" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Inherits: test_tbl2 + +DROP TABLE test_tbl2_subclass; +CREATE TYPE test_typex AS (a int, b text); +CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); +ALTER TYPE test_typex DROP ATTRIBUTE a; -- fails +ERROR: cannot drop column a of composite type test_typex because other objects depend on it +DETAIL: constraint test_tblx_y_check on table test_tblx depends on column a of composite type test_typex +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+ALTER TYPE test_typex DROP ATTRIBUTE a CASCADE; +NOTICE: drop cascades to constraint test_tblx_y_check on table test_tblx +\d test_tblx + Table "public.test_tblx" + Column | Type | Collation | Nullable | Default +--------+------------+-----------+----------+--------- + x | integer | | | + y | test_typex | | | + +DROP TABLE test_tblx; +DROP TYPE test_typex; +-- This test isn't that interesting on its own, but the purpose is to leave +-- behind a table to test pg_upgrade with. The table has a composite type +-- column in it, and the composite type has a dropped attribute. +CREATE TYPE test_type3 AS (a int); +CREATE TABLE test_tbl3 (c) AS SELECT '(1)'::test_type3; +ALTER TYPE test_type3 DROP ATTRIBUTE a, ADD ATTRIBUTE b int; +CREATE TYPE test_type_empty AS (); +DROP TYPE test_type_empty; +-- +-- typed tables: OF / NOT OF +-- +CREATE TYPE tt_t0 AS (z inet, x int, y numeric(8,2)); +ALTER TYPE tt_t0 DROP ATTRIBUTE z; +CREATE TABLE tt0 (x int NOT NULL, y numeric(8,2)); -- OK +CREATE TABLE tt1 (x int, y bigint); -- wrong base type +CREATE TABLE tt2 (x int, y numeric(9,2)); -- wrong typmod +CREATE TABLE tt3 (y numeric(8,2), x int); -- wrong column order +CREATE TABLE tt4 (x int); -- too few columns +CREATE TABLE tt5 (x int, y numeric(8,2), z int); -- too few columns +CREATE TABLE tt6 () INHERITS (tt0); -- can't have a parent +CREATE TABLE tt7 (x int, q text, y numeric(8,2)); +ALTER TABLE tt7 DROP q; -- OK +ALTER TABLE tt0 OF tt_t0; +ALTER TABLE tt1 OF tt_t0; +ERROR: table "tt1" has different type for column "y" +ALTER TABLE tt2 OF tt_t0; +ERROR: table "tt2" has different type for column "y" +ALTER TABLE tt3 OF tt_t0; +ERROR: table has column "y" where type requires "x" +ALTER TABLE tt4 OF tt_t0; +ERROR: table is missing column "y" +ALTER TABLE tt5 OF tt_t0; +ERROR: table has extra column "z" +ALTER TABLE tt6 OF tt_t0; +ERROR: typed tables cannot inherit +ALTER TABLE tt7 OF tt_t0; +CREATE TYPE tt_t1 AS (x int, y numeric(8,2)); +ALTER TABLE tt7 OF tt_t1; -- reassign an 
already-typed table +ALTER TABLE tt7 NOT OF; +\d tt7 + Table "public.tt7" + Column | Type | Collation | Nullable | Default +--------+--------------+-----------+----------+--------- + x | integer | | | + y | numeric(8,2) | | | + +-- make sure we can drop a constraint on the parent but it remains on the child +CREATE TABLE test_drop_constr_parent (c text CHECK (c IS NOT NULL)); +CREATE TABLE test_drop_constr_child () INHERITS (test_drop_constr_parent); +ALTER TABLE ONLY test_drop_constr_parent DROP CONSTRAINT "test_drop_constr_parent_c_check"; +-- should fail +INSERT INTO test_drop_constr_child (c) VALUES (NULL); +ERROR: new row for relation "test_drop_constr_child" violates check constraint "test_drop_constr_parent_c_check" +DETAIL: Failing row contains (null). +DROP TABLE test_drop_constr_parent CASCADE; +NOTICE: drop cascades to table test_drop_constr_child +-- +-- IF EXISTS test +-- +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +NOTICE: relation "tt8" does not exist, skipping +CREATE TABLE tt8(a int); +CREATE SCHEMA alter2; +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +\d alter2.tt8 + Table "alter2.tt8" + Column | Type | Collation | Nullable | Default 
+--------+---------+-----------+----------+--------- + a | integer | | | + f1 | integer | | not null | 0 +Indexes: + "xxx" PRIMARY KEY, btree (f1) +Check constraints: + "tt8_f_check" CHECK (f1 >= 0 AND f1 <= 10) + +DROP TABLE alter2.tt8; +DROP SCHEMA alter2; +-- +-- Check conflicts between index and CHECK constraint names +-- +CREATE TABLE tt9(c integer); +ALTER TABLE tt9 ADD CHECK(c > 1); +ALTER TABLE tt9 ADD CHECK(c > 2); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 3); +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 4); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD UNIQUE(c); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key UNIQUE(c); -- fail, dup name +ERROR: relation "tt9_c_key" already exists +ALTER TABLE tt9 ADD CONSTRAINT foo UNIQUE(c); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key CHECK(c > 5); -- fail, dup name +ERROR: constraint "tt9_c_key" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key2 CHECK(c > 6); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +\d tt9 + Table "public.tt9" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c | integer | | | +Indexes: + "tt9_c_key" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key1" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key3" UNIQUE CONSTRAINT, btree (c) +Check constraints: + "foo" CHECK (c > 3) + "tt9_c_check" CHECK (c > 1) + "tt9_c_check1" CHECK (c > 2) + "tt9_c_key2" CHECK (c > 6) + +DROP TABLE tt9; +-- Check that comments on constraints and indexes are not lost at ALTER TABLE. 
+CREATE TABLE comment_test ( + id int, + positive_col int CHECK (positive_col > 0), + indexed_col int, + CONSTRAINT comment_test_pk PRIMARY KEY (id)); +CREATE INDEX comment_test_index ON comment_test(indexed_col); +COMMENT ON COLUMN comment_test.id IS 'Column ''id'' on comment_test'; +COMMENT ON INDEX comment_test_index IS 'Simple index on comment_test'; +COMMENT ON CONSTRAINT comment_test_positive_col_check ON comment_test IS 'CHECK constraint on comment_test.positive_col'; +COMMENT ON CONSTRAINT comment_test_pk ON comment_test IS 'PRIMARY KEY constraint of comment_test'; +COMMENT ON INDEX comment_test_pk IS 'Index backing the PRIMARY KEY of comment_test'; +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Change the datatype of all the columns. ALTER TABLE is optimized to not +-- rebuild an index if the new data type is binary compatible with the old +-- one. Check do a dummy ALTER TABLE that doesn't change the datatype +-- first, to test that no-op codepath, and another one that does. 
+ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE int; +ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE bigint; +-- Check that the comments are intact. +SELECT col_description('comment_test'::regclass, 1) as comment; + comment +----------------------------- + Column 'id' on comment_test +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2; + index | comment +--------------------+----------------------------------------------- + comment_test_index | Simple index on comment_test + comment_test_pk | Index backing the PRIMARY KEY of comment_test +(2 rows) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2; + constraint | comment +---------------------------------+----------------------------------------------- + comment_test_pk | PRIMARY KEY constraint of comment_test + comment_test_positive_col_check | CHECK constraint on comment_test.positive_col +(2 rows) + +-- Check compatibility for foreign keys and comments. This is done +-- separately as rebuilding the column type of the parent leads +-- to an error and would reduce the test scope. 
+CREATE TABLE comment_test_child ( + id text CONSTRAINT comment_test_child_fk REFERENCES comment_test); +CREATE INDEX comment_test_child_fk ON comment_test_child(id); +COMMENT ON COLUMN comment_test_child.id IS 'Column ''id'' on comment_test_child'; +COMMENT ON INDEX comment_test_child_fk IS 'Index backing the FOREIGN KEY of comment_test_child'; +COMMENT ON CONSTRAINT comment_test_child_fk ON comment_test_child IS 'FOREIGN KEY constraint of comment_test_child'; +-- Change column type of parent +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int USING id::integer; +ERROR: foreign key constraint "comment_test_child_fk" cannot be implemented +DETAIL: Key columns "id" and "id" are of incompatible types: text and integer. +-- Comments should be intact +SELECT col_description('comment_test_child'::regclass, 1) as comment; + comment +----------------------------------- + Column 'id' on comment_test_child +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + index | comment +-----------------------+----------------------------------------------------- + comment_test_child_fk | Index backing the FOREIGN KEY of comment_test_child +(1 row) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + constraint | comment +-----------------------+---------------------------------------------- + comment_test_child_fk | FOREIGN KEY constraint of comment_test_child +(1 row) + +-- Check that we map relation oids to filenodes and back correctly. Only +-- display bad mappings so the test output doesn't change all the time. 
A +-- filenode function call can return NULL for a relation dropped concurrently +-- with the call's surrounding query, so ignore a NULL mapped_oid for +-- relations that no longer exist after all calls finish. +CREATE TEMP TABLE filenode_mapping AS +SELECT + oid, mapped_oid, reltablespace, relfilenode, relname +FROM pg_class, + pg_filenode_relation(reltablespace, pg_relation_filenode(oid)) AS mapped_oid +WHERE relkind IN ('r', 'i', 'S', 't', 'm') AND mapped_oid IS DISTINCT FROM oid; +SELECT m.* FROM filenode_mapping m LEFT JOIN pg_class c ON c.oid = m.oid +WHERE c.oid IS NOT NULL OR m.mapped_oid IS NOT NULL; + oid | mapped_oid | reltablespace | relfilenode | relname +-----+------------+---------------+-------------+--------- +(0 rows) + +-- Checks on creating and manipulation of user defined relations in +-- pg_catalog. +SHOW allow_system_table_mods; + allow_system_table_mods +------------------------- + off +(1 row) + +-- disallowed because of search_path issues with pg_dump +CREATE TABLE pg_catalog.new_system_table(); +ERROR: permission denied to create "pg_catalog.new_system_table" +DETAIL: System catalog modifications are currently disallowed. 
+-- instead create in public first, move to catalog +CREATE TABLE new_system_table(id serial primary key, othercol text); +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table SET SCHEMA public; +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +-- will be ignored -- already there: +ALTER TABLE new_system_table SET SCHEMA pg_catalog; +ALTER TABLE new_system_table RENAME TO old_system_table; +CREATE INDEX old_system_table__othercol ON old_system_table (othercol); +INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata'); +UPDATE old_system_table SET id = -id; +DELETE FROM old_system_table WHERE othercol = 'somedata'; +TRUNCATE old_system_table; +ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; +ALTER TABLE old_system_table DROP COLUMN othercol; +DROP TABLE old_system_table; +-- set logged +CREATE UNLOGGED TABLE unlogged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of an unlogged table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + toast index | i | p + toast table | t | p + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(5 rows) + +CREATE UNLOGGED TABLE unlogged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged1); -- foreign key +CREATE UNLOGGED TABLE unlogged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged3); -- self-referencing foreign key +ALTER TABLE unlogged3 SET LOGGED; -- skip self-referencing foreign key +ALTER TABLE 
unlogged2 SET LOGGED; -- fails because a foreign key to an unlogged table exists +ALTER TABLE unlogged1 SET LOGGED; +-- check relpersistence of an unlogged table after changing to permanent +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1' +ORDER BY relname; + relname | relkind | relpersistence +------------------+---------+---------------- + toast index | i | p + toast table | t | p + unlogged1 | r | p + unlogged1_f1_seq | S | p + unlogged1_pkey | i | p +(5 rows) + +ALTER TABLE unlogged1 SET LOGGED; -- silently do nothing +DROP TABLE unlogged3; +DROP TABLE unlogged2; +DROP TABLE unlogged1; +-- set unlogged +CREATE TABLE logged1(f1 SERIAL PRIMARY KEY, f2 TEXT); +-- check relpersistence of a permanent table +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | p + logged1_f1_seq | S | p + logged1_pkey | i | p + toast index | i | p + toast table | t | p +(5 rows) + +CREATE TABLE logged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged1); -- foreign key +CREATE TABLE logged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged3); -- 
self-referencing foreign key +ALTER TABLE logged1 SET UNLOGGED; -- fails because a foreign key from a permanent table exists +ERROR: could not change table "logged1" to unlogged because it references logged table "logged2" +ALTER TABLE logged3 SET UNLOGGED; -- skip self-referencing foreign key +ALTER TABLE logged2 SET UNLOGGED; +ALTER TABLE logged1 SET UNLOGGED; +-- check relpersistence of a permanent table after changing to unlogged +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | u + logged1_f1_seq | S | p + logged1_pkey | i | u + toast index | i | u + toast table | t | u +(5 rows) + +ALTER TABLE logged1 SET UNLOGGED; -- silently do nothing +DROP TABLE logged3; +DROP TABLE logged2; +DROP TABLE logged1; +-- test ADD COLUMN IF NOT EXISTS +CREATE TABLE test_add_column(c1 integer); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +ALTER TABLE ONLY test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists 
+ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +ALTER TABLE ONLY test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer, -- fail because c2 already exists + ADD COLUMN c3 integer primary key; +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN c3 integer primary key; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer primary key; -- skipping because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping 
+NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer, -- skipping because c3 already exists + ADD COLUMN c4 integer REFERENCES test_add_column; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c4 integer REFERENCES test_add_column; +NOTICE: column "c4" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + 
+ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 8); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 10); +NOTICE: column "c5" of relation "test_add_column" already exists, skipping +\d test_add_column* + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + + Sequence "public.test_add_column_c5_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.test_add_column.c5 + + Index "public.test_add_column_pkey" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + c3 | integer | yes | c3 +primary key, btree, for table "public.test_add_column" + +DROP TABLE test_add_column; +\d test_add_column* +-- assorted cases with multiple ALTER TABLE steps +CREATE TABLE ataddindex(f1 INT); +INSERT INTO ataddindex VALUES (42), (43); +CREATE UNIQUE INDEX ataddindexi0 ON ataddindex(f1); +ALTER TABLE ataddindex + ADD PRIMARY KEY USING INDEX ataddindexi0, + ALTER f1 TYPE BIGINT; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | +Indexes: + "ataddindexi0" PRIMARY KEY, btree (f1) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(f1 VARCHAR(10)); +INSERT INTO ataddindex(f1) VALUES ('foo'), ('a'); +ALTER TABLE ataddindex + ALTER f1 SET DATA TYPE TEXT, + ADD EXCLUDE ((f1 LIKE 'a') WITH =); +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + f1 | text | | | +Indexes: + "ataddindex_expr_excl" EXCLUDE USING btree ((f1 ~~ 'a'::text) WITH =) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD PRIMARY KEY (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | not null | + ref_id | integer | | | +Indexes: + "ataddindex_pkey" PRIMARY KEY, btree (id) +Foreign-key constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD UNIQUE (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex (id); +\d ataddindex + Table "public.ataddindex" + 
Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + ref_id | integer | | | +Indexes: + "ataddindex_id_key" UNIQUE CONSTRAINT, btree (id) +Foreign-key constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +-- unsupported constraint types for partitioned tables +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, (a+b+1)); +ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); + ^ +-- cannot drop column that is part of the partition key +ALTER TABLE partitioned DROP COLUMN a; +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" +-- partitioned table cannot participate in regular inheritance +CREATE TABLE nonpartitioned ( + a int, + b int +); +ALTER TABLE partitioned INHERIT nonpartitioned; +ERROR: cannot change inheritance of partitioned table +ALTER TABLE nonpartitioned INHERIT partitioned; +ERROR: cannot inherit from partitioned table "partitioned" +-- cannot add NO INHERIT constraint to partitioned tables +ALTER TABLE partitioned ADD CONSTRAINT chk_a CHECK (a > 0) NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +DROP TABLE partitioned, 
nonpartitioned; +-- +-- ATTACH PARTITION +-- +-- check that target table is partitioned +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part (like unparted); +ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); +ERROR: table "unparted" is not partitioned +DROP TABLE unparted, fail_part; +-- check that partition bound is compatible +CREATE TABLE list_parted ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +) PARTITION BY LIST (a); +CREATE TABLE fail_part (LIKE list_parted); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) TO (10); +ERROR: invalid bound specification for a list partition +LINE 1: ...list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) T... + ^ +DROP TABLE fail_part; +-- check that the table being attached exists +ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); +ERROR: relation "nonexistent" does not exist +-- check ownership of the source table +CREATE ROLE regress_test_me; +CREATE ROLE regress_test_not_me; +CREATE TABLE not_owned_by_me (LIKE list_parted); +ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; +SET SESSION AUTHORIZATION regress_test_me; +CREATE TABLE owned_by_me ( + a int +) PARTITION BY LIST (a); +ALTER TABLE owned_by_me ATTACH PARTITION not_owned_by_me FOR VALUES IN (1); +ERROR: must be owner of table not_owned_by_me +RESET SESSION AUTHORIZATION; +DROP TABLE owned_by_me, not_owned_by_me; +DROP ROLE regress_test_not_me; +DROP ROLE regress_test_me; +-- check that the table being attached is not part of regular inheritance +CREATE TABLE parent (LIKE list_parted); +CREATE TABLE child () INHERITS (parent); +ALTER TABLE list_parted ATTACH PARTITION child FOR VALUES IN (1); +ERROR: cannot attach inheritance child as partition +ALTER TABLE list_parted ATTACH PARTITION parent FOR VALUES IN (1); +ERROR: cannot attach inheritance parent as partition +DROP TABLE parent CASCADE; +NOTICE: drop cascades to table child +-- check any 
TEMP-ness +CREATE TEMP TABLE temp_parted (a int) PARTITION BY LIST (a); +CREATE TABLE perm_part (a int); +ALTER TABLE temp_parted ATTACH PARTITION perm_part FOR VALUES IN (1); +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted, perm_part; +-- check that the table being attached is not a typed table +CREATE TYPE mytype AS (a int); +CREATE TABLE fail_part OF mytype; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: cannot attach a typed table as partition +DROP TYPE mytype CASCADE; +NOTICE: drop cascades to table fail_part +-- check that the table being attached has only columns present in the parent +CREATE TABLE fail_part (like list_parted, c int); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: table "fail_part" contains column "c" not found in parent "list_parted" +DETAIL: The new partition may contain only the columns present in parent. +DROP TABLE fail_part; +-- check that the table being attached has every column of the parent +CREATE TABLE fail_part (a int NOT NULL); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing column "b" +DROP TABLE fail_part; +-- check that columns match in type, collation and NOT NULL status +CREATE TABLE fail_part ( + b char(3), + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different type for column "b" +ALTER TABLE fail_part ALTER b TYPE char (2) COLLATE "POSIX"; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different collation for column "b" +DROP TABLE fail_part; +-- check that the table being attached has all constraints of the parent +CREATE TABLE fail_part ( + b char(2) COLLATE "C", + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing constraint 
"check_a" +-- check that the constraint matches in definition with parent's constraint +ALTER TABLE fail_part ADD CONSTRAINT check_a CHECK (a >= 0); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different definition for check constraint "check_a" +DROP TABLE fail_part; +-- check the attributes and constraints after partition is attached +CREATE TABLE part_1 ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +); +ALTER TABLE list_parted ATTACH PARTITION part_1 FOR VALUES IN (1); +-- attislocal and conislocal are always false for merged attributes and constraints respectively. +SELECT attislocal, attinhcount FROM pg_attribute WHERE attrelid = 'part_1'::regclass AND attnum > 0; + attislocal | attinhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::regclass AND conname = 'check_a'; + conislocal | coninhcount +------------+------------- + f | 1 +(1 row) + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: partition "fail_part" would overlap partition "part_1" +LINE 1: ...LE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); + ^ +DROP TABLE fail_part; +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" +LINE 1: ...ER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; + ^ +-- check 
validation when attaching list partitions +CREATE TABLE list_parted2 ( + a int, + b char +) PARTITION BY LIST (a); +-- check that violating rows are correctly reported +CREATE TABLE part_2 (LIKE list_parted2); +INSERT INTO part_2 VALUES (3, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: partition constraint of relation "part_2" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part_2; +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part_3_4 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IN (3)) +); +-- however, if a list partition does not accept nulls, there should be +-- an explicit NOT NULL constraint on the partition key column for the +-- validation scan to be skipped; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- adding a NOT NULL constraint will cause the scan to be skipped +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +ALTER TABLE part_3_4 ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); 
+-- check validation when attaching range partitions +CREATE TABLE range_parted ( + a int, + b int +) PARTITION BY RANGE (a, b); +-- check that violating rows are correctly reported +CREATE TABLE part1 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 1 AND b <= 10) +); +INSERT INTO part1 VALUES (1, 10); +-- Remember the TO bound is exclusive +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +ERROR: partition constraint of relation "part1" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part1; +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part2 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 10 AND b < 18) +); +ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +LINE 1: ...LTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; + ^ +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition "partr_def1" would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); +-- check that leaf partitions are scanned when 
attaching a partitioned +-- table +CREATE TABLE part_5 ( + LIKE list_parted2 +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE part_5_a PARTITION OF part_5 FOR VALUES IN ('a'); +INSERT INTO part_5_a (a, b) VALUES (6, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ERROR: partition constraint of relation "part_5_a" is violated by some row +-- delete the faulting row and also add a constraint to skip the scan +DELETE FROM part_5_a WHERE a NOT IN (3); +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 5); +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +ALTER TABLE list_parted2 DETACH PARTITION part_5; +ALTER TABLE part_5 DROP CONSTRAINT check_a; +-- scan should again be skipped, even though NOT NULL is now a column property +ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IN (5)), ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5); +-- Check the case where attnos of the partitioning columns in the table being +-- attached differs from the parent. It should not affect the constraint- +-- checking logic that allows to skip the scan. +CREATE TABLE part_6 ( + c int, + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 6) +); +ALTER TABLE part_6 DROP c; +ALTER TABLE list_parted2 ATTACH PARTITION part_6 FOR VALUES IN (6); +-- Similar to above, but the table being attached is a partitioned table +-- whose partition has still different attnos for the root partitioning +-- columns. 
+CREATE TABLE part_7 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +) PARTITION BY LIST (b); +CREATE TABLE part_7_a_null ( + c int, + d int, + e int, + LIKE list_parted2, -- 'a' will have attnum = 4 + CONSTRAINT check_b CHECK (b IS NULL OR b = 'a'), + CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7) +); +ALTER TABLE part_7_a_null DROP c, DROP d, DROP e; +ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +-- Same example, but check this time that the constraint correctly detects +-- violating rows +ALTER TABLE list_parted2 DETACH PARTITION part_7; +ALTER TABLE part_7 DROP CONSTRAINT check_a; -- thusly, scan won't be skipped +INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a'); +SELECT tableoid::regclass, a, b FROM part_7 order by a; + tableoid | a | b +---------------+---+--- + part_7_a_null | 8 | + part_7_a_null | 9 | a +(2 rows) + +ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +ERROR: partition constraint of relation "part_7_a_null" is violated by some row +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. 
+ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition "part5_def_p1" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- check that the table being attached is not already a partition +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: "part_2" is already a partition +-- check that circular inheritance is not allowed +ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); +ERROR: circular inheritance not allowed +DETAIL: "part_5" is already a child of "list_parted2". +ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +ERROR: circular inheritance not allowed +DETAIL: "list_parted2" is already a child of "list_parted2". +-- If a partitioned table being created or an existing table being attached +-- as a partition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! 
+CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +DROP TABLE quuux; +-- check validation when attaching hash partitions +-- Use hand-rolled hash functions and operator class to get predictable result +-- on different machines. part_test_int4_ops is defined in insert.sql. +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a part_test_int4_ops); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... + ^ +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... 
+ ^ +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint of relation "hpart_2" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint of relation "hpart_5_a" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 3 is not a factor of 4, the modulus of existing partition "hpart_1". 
+DROP TABLE fail_part; +-- +-- DETACH PARTITION +-- +-- check that the table is partitioned at all +CREATE TABLE regular_table (a int); +ALTER TABLE regular_table DETACH PARTITION any_name; +ERROR: table "regular_table" is not partitioned +DROP TABLE regular_table; +-- check that the partition being detached exists at all +ALTER TABLE list_parted2 DETACH PARTITION part_4; +ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist +-- check that the partition being detached is actually a partition of the parent +CREATE TABLE not_a_part (a int); +ALTER TABLE list_parted2 DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "list_parted2" +ALTER TABLE list_parted2 DETACH PARTITION part_1; +ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; +-- check that, after being detached, attinhcount/coninhcount is dropped to 0 and +-- attislocal/conislocal is set to true +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +SELECT attinhcount, attislocal FROM pg_attribute WHERE attrelid = 'part_3_4'::regclass AND attnum > 0; + attinhcount | attislocal +-------------+------------ + 0 | t + 0 | t +(2 rows) + +SELECT coninhcount, conislocal FROM pg_constraint WHERE conrelid = 'part_3_4'::regclass AND conname = 'check_a'; + coninhcount | conislocal +-------------+------------ + 0 | t +(1 row) + +DROP TABLE part_3_4; +-- check that a detached partition is not dropped on dropping a partitioned table +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +ALTER TABLE range_parted2 DETACH PARTITION part_rp; +DROP TABLE range_parted2; +SELECT * from part_rp; + a +--- +(0 rows) + +DROP TABLE part_rp; +-- concurrent detach 
+CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE(a); +CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100); +BEGIN; +-- doesn't work in a partition block +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +ERROR: ALTER TABLE ... DETACH CONCURRENTLY cannot run inside a transaction block +COMMIT; +CREATE TABLE part_rpd PARTITION OF range_parted2 DEFAULT; +-- doesn't work if there's a default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +-- doesn't work for the default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rpd CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +DROP TABLE part_rpd; +-- works fine +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +\d+ range_parted2 + Partitioned table "public.range_parted2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: RANGE (a) +Number of partitions: 0 + +-- constraint should be created +\d part_rp + Table "public.part_rp" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp_a_check" CHECK (a IS NOT NULL AND a >= 0 AND a < 100) + +CREATE TABLE part_rp100 PARTITION OF range_parted2 (CHECK (a>=123 AND a<133 AND a IS NOT NULL)) FOR VALUES FROM (100) to (200); +ALTER TABLE range_parted2 DETACH PARTITION part_rp100 CONCURRENTLY; +-- redundant constraint should not be created +\d part_rp100 + Table "public.part_rp100" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp100_a_check" CHECK (a >= 123 AND a < 133 AND a IS NOT NULL) + 
+DROP TABLE range_parted2; +-- Check ALTER TABLE commands for partitioned tables and partitions +-- cannot add/drop column to/from *only* the parent +ALTER TABLE ONLY list_parted2 ADD COLUMN c int; +ERROR: column must be added to child tables too +ALTER TABLE ONLY list_parted2 DROP COLUMN b; +ERROR: cannot drop column from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +-- cannot add a column to partition or drop an inherited one +ALTER TABLE part_2 ADD COLUMN c text; +ERROR: cannot add column to a partition +ALTER TABLE part_2 DROP COLUMN b; +ERROR: cannot drop inherited column "b" +-- Nor rename, alter type +ALTER TABLE part_2 RENAME COLUMN b to c; +ERROR: cannot rename inherited column "b" +ALTER TABLE part_2 ALTER COLUMN b TYPE text; +ERROR: cannot alter inherited column "b" +-- cannot add/drop NOT NULL or check constraints to *only* the parent, when +-- partitions exist +ALTER TABLE ONLY list_parted2 ALTER b SET NOT NULL; +ERROR: constraint must be added to child tables too +DETAIL: Column "b" of relation "part_2" is not already NOT NULL. +HINT: Do not specify the ONLY keyword. +ALTER TABLE ONLY list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ERROR: constraint must be added to child tables too +ALTER TABLE list_parted2 ALTER b SET NOT NULL; +ALTER TABLE ONLY list_parted2 ALTER b DROP NOT NULL; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +ALTER TABLE list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ALTER TABLE ONLY list_parted2 DROP CONSTRAINT check_b; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. 
+-- It's alright though, if no partitions are yet created +CREATE TABLE parted_no_parts (a int) PARTITION BY LIST (a); +ALTER TABLE ONLY parted_no_parts ALTER a SET NOT NULL; +ALTER TABLE ONLY parted_no_parts ADD CONSTRAINT check_a CHECK (a > 0); +ALTER TABLE ONLY parted_no_parts ALTER a DROP NOT NULL; +ALTER TABLE ONLY parted_no_parts DROP CONSTRAINT check_a; +DROP TABLE parted_no_parts; +-- cannot drop inherited NOT NULL or check constraints from partition +ALTER TABLE list_parted2 ALTER b SET NOT NULL, ADD CONSTRAINT check_a2 CHECK (a > 0); +ALTER TABLE part_2 ALTER b DROP NOT NULL; +ERROR: column "b" is marked NOT NULL in parent table +ALTER TABLE part_2 DROP CONSTRAINT check_a2; +ERROR: cannot drop inherited constraint "check_a2" of relation "part_2" +-- Doesn't make sense to add NO INHERIT constraints on partitioned tables +ALTER TABLE list_parted2 add constraint check_b2 check (b <> 'zz') NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "list_parted2" +-- check that a partition cannot participate in regular inheritance +CREATE TABLE inh_test () INHERITS (part_2); +ERROR: cannot inherit from partition "part_2" +CREATE TABLE inh_test (LIKE part_2); +ALTER TABLE inh_test INHERIT part_2; +ERROR: cannot inherit from a partition +ALTER TABLE part_2 INHERIT inh_test; +ERROR: cannot change inheritance of a partition +-- cannot drop or alter type of partition key columns of lower level +-- partitioned tables; for example, part_5, which is list_parted2's +-- partition, is partitioned on b; +ALTER TABLE list_parted2 DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "part_5" +ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +ERROR: cannot alter column "b" because it is part of the partition key of relation "part_5" +-- dropping non-partition key columns should be allowed on the parent table. 
+ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + +-- cleanup +DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; +DROP TABLE hash_parted; +-- more tests for certain multi-level partitioning scenarios +create table p (a int, b int) partition by range (a, b); +create table p1 (b int, a int not null) partition by range (b); +create table p11 (like p1); +alter table p11 drop a; +alter table p11 add a int; +alter table p11 drop a; +alter table p11 add a int not null; +-- attnum for key attribute 'a' is different in p, p1, and p11 +select attrelid::regclass, attname, attnum +from pg_attribute +where attname = 'a' + and (attrelid = 'p'::regclass + or attrelid = 'p1'::regclass + or attrelid = 'p11'::regclass) +order by attrelid::regclass::text; + attrelid | attname | attnum +----------+---------+-------- + p | a | 1 + p1 | a | 2 + p11 | a | 4 +(3 rows) + +alter table p1 attach partition p11 for values from (2) to (5); +insert into p1 (a, b) values (2, 3); +-- check that partition validation scan correctly detects violating rows +alter table p attach partition p1 for values from (1, 2) to (1, 10); +ERROR: partition constraint of relation "p11" is violated by some row +-- cleanup +drop table p; +drop table p1; +-- validate constraint on partitioned tables should only scan leaf partitions +create table parted_validate_test (a int) partition by list (a); +create table parted_validate_test_1 partition of parted_validate_test for values in (0, 1); +alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; +alter table parted_validate_test validate constraint parted_validate_test_chka; +drop table parted_validate_test; +-- test alter column options +CREATE TABLE attmp(i integer); +INSERT INTO attmp VALUES (1); +ALTER TABLE attmp ALTER COLUMN i SET (n_distinct = 1, n_distinct_inherited = 2); +ALTER TABLE attmp ALTER COLUMN i RESET (n_distinct_inherited); +ANALYZE attmp; 
+DROP TABLE attmp; +DROP USER regress_alter_table_user1; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (b int, a int); +alter table defpart_attach_test_d drop b; +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint of relation "defpart_attach_test_d" is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +-- check that attaching a partition correctly reports any rows in the default +-- partition that should not be there for the new partition to be attached +-- successfully +create table defpart_attach_test_2 (like defpart_attach_test_d); +alter table defpart_attach_test attach partition defpart_attach_test_2 for values in (2); +ERROR: updated partition constraint for default partition "defpart_attach_test_d" would be violated by some row +drop table defpart_attach_test; +-- check combinations of temporary and permanent relations when attaching +-- partitions. 
+create table perm_part_parent (a int) partition by list (a); +create temp table temp_part_parent (a int) partition by list (a); +create table perm_part_child (a int); +create temp table temp_part_child (a int); +alter table temp_part_parent attach partition perm_part_child default; -- error +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_part_parent" +alter table perm_part_parent attach partition temp_part_child default; -- error +ERROR: cannot attach a temporary relation as partition of permanent relation "perm_part_parent" +alter table temp_part_parent attach partition temp_part_child default; -- ok +drop table perm_part_parent cascade; +drop table temp_part_parent cascade; +-- check that attaching partitions to a table while it is being used is +-- prevented +create table tab_part_attach (a int) partition by list (a); +create or replace function func_part_attach() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_attach_1 (a int)'; + execute 'alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)'; + return null; + end $$; +create trigger trig_part_attach before insert on tab_part_attach + for each statement execute procedure func_part_attach(); +insert into tab_part_attach values (1); +ERROR: cannot ALTER TABLE "tab_part_attach" because it is being used by active queries in this session +CONTEXT: SQL statement "alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)" +PL/pgSQL function func_part_attach() line 4 at EXECUTE +drop table tab_part_attach; +drop function func_part_attach(); +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator 
class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; +/* Test case for bug #16242 */ +-- We create a parent and child where the child has missing +-- non-null attribute values, and arrange to pass them through +-- tuple conversion from the child to the parent tupdesc +create table bar1 (a integer, b integer not null default 1) + partition by range (a); +create table bar2 (a integer); +insert into bar2 values (1); +alter table bar2 add column b integer not null default 1; +-- (at this point bar2 contains tuple with natts=1) +alter table bar1 attach partition bar2 default; +-- this works: +select * from bar1; + a | b +---+--- + 1 | 1 +(1 row) + +-- this exercises tuple conversion: +create function xtrig() + returns trigger language plpgsql +as $$ + declare + r record; + begin + for r in select * from old loop + raise info 'a=%, b=%', r.a, r.b; + end loop; + return NULL; + end; +$$; +create trigger xtrig + after update on bar1 + referencing old table as old + for each statement execute procedure xtrig(); +update bar1 set a = a + 1; +INFO: a=1, b=1 +/* End test case for bug #16242 */ +-- Test that ALTER TABLE rewrite preserves a clustered index +-- for normal indexes and indexes on constraints. +create table alttype_cluster (a int); +alter table alttype_cluster add primary key (a); +create index alttype_cluster_ind on alttype_cluster (a); +alter table alttype_cluster cluster on alttype_cluster_ind; +-- Normal index remains clustered. 
+select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +alter table alttype_cluster alter a type bigint; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +-- Constraint index remains clustered. +alter table alttype_cluster cluster on alttype_cluster_pkey; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +alter table alttype_cluster alter a type int; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +drop table alttype_cluster; diff --git a/src/test/regress/expected/create_table_1.out b/src/test/regress/expected/create_table_1.out new file mode 100644 index 00000000000..4ec5f297a34 --- /dev/null +++ b/src/test/regress/expected/create_table_1.out @@ -0,0 +1,1315 @@ +-- +-- CREATE_TABLE +-- +-- +-- CLASS DEFINITIONS +-- +CREATE TABLE hobbies_r ( + name text, + person text +); +CREATE TABLE equipment_r ( + name text, + hobby text +); +CREATE TABLE onek ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + 
stringu2 name, + string4 name +); +CREATE TABLE tenk1 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE tenk2 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE person ( + name text, + age int4, + location point +); +CREATE TABLE emp ( + salary int4, + manager name +) INHERITS (person); +CREATE TABLE student ( + gpa float8 +) INHERITS (person); +CREATE TABLE stud_emp ( + percent int4 +) INHERITS (emp, student); +NOTICE: merging multiple inherited definitions of column "name" +NOTICE: merging multiple inherited definitions of column "age" +NOTICE: merging multiple inherited definitions of column "location" +CREATE TABLE city ( + name name, + location box, + budget city_budget +); +CREATE TABLE dept ( + dname name, + mgrname text +); +CREATE TABLE slow_emp4000 ( + home_base box +); +CREATE TABLE fast_emp4000 ( + home_base box +); +CREATE TABLE road ( + name text, + thepath path +); +CREATE TABLE ihighway () INHERITS (road); +CREATE TABLE shighway ( + surface text +) INHERITS (road); +CREATE TABLE real_city ( + pop int4, + cname text, + outline path +); +-- +-- test the "star" operators a bit more thoroughly -- this time, +-- throw in lots of NULL fields... 
+-- +-- a is the type root +-- b and c inherit from a (one-level single inheritance) +-- d inherits from b and c (two-level multiple inheritance) +-- e inherits from c (two-level single inheritance) +-- f inherits from e (three-level single inheritance) +-- +CREATE TABLE a_star ( + class char, + a int4 +); +CREATE TABLE b_star ( + b text +) INHERITS (a_star); +CREATE TABLE c_star ( + c name +) INHERITS (a_star); +CREATE TABLE d_star ( + d float8 +) INHERITS (b_star, c_star); +NOTICE: merging multiple inherited definitions of column "class" +NOTICE: merging multiple inherited definitions of column "a" +CREATE TABLE e_star ( + e int2 +) INHERITS (c_star); +CREATE TABLE f_star ( + f polygon +) INHERITS (e_star); +CREATE TABLE aggtest ( + a int2, + b float4 +); +CREATE TABLE hash_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE hash_name_heap ( + seqno int4, + random name +); +CREATE TABLE hash_txt_heap ( + seqno int4, + random text +); +CREATE TABLE hash_f8_heap ( + seqno int4, + random float8 +); +-- don't include the hash_ovfl_heap stuff in the distribution +-- the data set is too large for what it's worth +-- +-- CREATE TABLE hash_ovfl_heap ( +-- x int4, +-- y int4 +-- ); +CREATE TABLE bt_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE bt_name_heap ( + seqno name, + random int4 +); +CREATE TABLE bt_txt_heap ( + seqno text, + random int4 +); +CREATE TABLE bt_f8_heap ( + seqno float8, + random int4 +); +CREATE TABLE array_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE array_index_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE testjsonb ( + j jsonb +); +CREATE TABLE unknowntab ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TYPE unknown_comptype AS ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TABLE IF NOT EXISTS test_tsvector( + t text, + a tsvector +); +CREATE TABLE IF NOT EXISTS test_tsvector( + t text +); +NOTICE: relation "test_tsvector" already exists, 
skipping +-- invalid: non-lowercase quoted reloptions identifiers +CREATE TABLE tas_case WITH ("Fillfactor" = 10) AS SELECT 1 a; +ERROR: unrecognized parameter "Fillfactor" +CREATE UNLOGGED TABLE unlogged1 (a int primary key); -- OK +CREATE TEMPORARY TABLE unlogged2 (a int primary key); -- OK +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +REINDEX INDEX unlogged1_pkey; +REINDEX INDEX unlogged2_pkey; +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +DROP TABLE unlogged2; +INSERT INTO unlogged1 VALUES (42); +CREATE UNLOGGED TABLE public.unlogged2 (a int primary key); -- also OK +CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); -- not OK +ERROR: only temporary relations may be created in temporary schemas +LINE 1: CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); + ^ +CREATE TABLE pg_temp.implicitly_temp (a int primary key); -- OK +CREATE TEMP TABLE explicitly_temp (a int primary key); -- also OK +CREATE TEMP TABLE pg_temp.doubly_temp (a int primary key); -- also OK +CREATE TEMP TABLE public.temp_to_perm (a int primary key); -- not OK +ERROR: cannot create temporary relation in non-temporary schema +LINE 1: CREATE TEMP TABLE public.temp_to_perm (a int primary key); + ^ +DROP TABLE unlogged1, public.unlogged2; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +ERROR: relation "as_select1" already exists +CREATE TABLE IF NOT EXISTS as_select1 AS SELECT * FROM pg_class WHERE relkind = 
'r'; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +PREPARE select1 AS SELECT 1 as a; +CREATE TABLE as_select1 AS EXECUTE select1; +CREATE TABLE as_select1 AS EXECUTE select1; +ERROR: relation "as_select1" already exists +SELECT * FROM as_select1; + a +--- + 1 +(1 row) + +CREATE TABLE IF NOT EXISTS as_select1 AS EXECUTE select1; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +DEALLOCATE select1; +-- create an extra wide table to test for issues related to that +-- (temporarily hide query, to avoid the long CREATE TABLE stmt) +\set ECHO none +INSERT INTO extra_wide_table(firstc, lastc) VALUES('first col', 'last col'); +SELECT firstc, lastc FROM extra_wide_table; + firstc | lastc +-----------+---------- + first col | last col +(1 row) + +-- check that tables with oids cannot be created anymore +CREATE TABLE withoid() WITH OIDS; +ERROR: syntax error at or near "OIDS" +LINE 1: CREATE TABLE withoid() WITH OIDS; + ^ +CREATE TABLE withoid() WITH (oids); +ERROR: tables declared WITH OIDS are not supported +CREATE TABLE withoid() WITH (oids = true); +ERROR: tables declared WITH OIDS are not supported +-- but explicitly not adding oids is still supported +CREATE TEMP TABLE withoutoid() WITHOUT OIDS; DROP TABLE withoutoid; +CREATE TEMP TABLE withoutoid() WITH (oids = false); DROP TABLE withoutoid; +-- check restriction with default expressions +-- invalid use of column reference in default expressions +CREATE TABLE default_expr_column (id int DEFAULT (id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (id)); + ^ +CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); + ^ +CREATE TABLE default_expr_agg_column (id int DEFAULT (avg(id))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TE 
TABLE default_expr_agg_column (id int DEFAULT (avg(id))); + ^ +-- invalid column definition +CREATE TABLE default_expr_non_column (a int DEFAULT (avg(non_existent))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TABLE default_expr_non_column (a int DEFAULT (avg(non_existe... + ^ +-- invalid use of aggregate +CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); +ERROR: aggregate functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); + ^ +-- invalid use of subquery +CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); +ERROR: cannot use subquery in DEFAULT expression +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); + ^ +-- invalid use of set-returning function +CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1,3))); +ERROR: set-returning functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (generate_serie... + ^ +-- Verify that subtransaction rollback restores rd_createSubid. +BEGIN; +CREATE TABLE remember_create_subid (c int); +SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_create_subid; +-- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid. 
+CREATE TABLE remember_node_subid (c int); +BEGIN; +ALTER TABLE remember_node_subid ALTER c TYPE bigint; +SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_node_subid; +-- +-- Partitioned tables +-- +-- cannot combine INHERITS and PARTITION BY (although grammar allows) +CREATE TABLE partitioned ( + a int +) INHERITS (some_table) PARTITION BY LIST (a); +ERROR: cannot create partitioned table as inheritance child +-- cannot use more than 1 column as partition key for list partitioned table +CREATE TABLE partitioned ( + a1 int, + a2 int +) PARTITION BY LIST (a1, a2); -- fail +ERROR: cannot use "list" partition strategy with more than one column +-- unsupported constraint type for partitioned tables +CREATE TABLE partitioned ( + a int, + EXCLUDE USING gist (a WITH &&) +) PARTITION BY RANGE (a); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 3: EXCLUDE USING gist (a WITH &&) + ^ +-- prevent using prohibited expressions in the key +CREATE FUNCTION retset (a int) RETURNS SETOF int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (retset(a)); +ERROR: set-returning functions are not allowed in partition key expressions +DROP FUNCTION retset(int); +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((avg(a))); +ERROR: aggregate functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE ((avg(a) OVER (PARTITION BY b))); +ERROR: window functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int +) PARTITION BY LIST ((a LIKE (SELECT 1))); +ERROR: cannot use subquery in partition key expression +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((42)); +ERROR: cannot use constant expression as partition key +CREATE FUNCTION const_func () RETURNS int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE 
(const_func()); +ERROR: cannot use constant expression as partition key +DROP FUNCTION const_func(); +-- only accept valid partitioning strategy +CREATE TABLE partitioned ( + a int +) PARTITION BY MAGIC (a); +ERROR: unrecognized partitioning strategy "magic" +-- specified column must be present in the table +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (b); +ERROR: column "b" named in partition key does not exist +LINE 3: ) PARTITION BY RANGE (b); + ^ +-- cannot use system columns in partition key +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (xmin); +ERROR: cannot use system column "xmin" in partition key +LINE 3: ) PARTITION BY RANGE (xmin); + ^ +-- cannot use pseudotypes +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (((a, b))); +ERROR: partition key column 1 has pseudo-type record +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, ('unknown')); +ERROR: partition key column 2 has pseudo-type unknown +-- functions in key must be immutable +CREATE FUNCTION immut_func (a int) RETURNS int AS $$ SELECT a + random()::int; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (immut_func(a)); +ERROR: functions in partition key expression must be marked IMMUTABLE +DROP FUNCTION immut_func(int); +-- prevent using columns of unsupported types in key (type must have a btree operator class) +CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. 
+CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +-- cannot add NO INHERIT constraints to partitioned tables +CREATE TABLE partitioned ( + a int, + CONSTRAINT check_a CHECK (a > 0) NO INHERIT +) PARTITION BY RANGE (a); +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +-- some checks after successful creation of a partitioned table +CREATE FUNCTION plusone(a int) RETURNS INT AS $$ SELECT a+1; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int, + b int, + c text, + d text +) PARTITION BY RANGE (a oid_ops, plusone(b), c collate "default", d collate "C"); +-- check relkind +SELECT relkind FROM pg_class WHERE relname = 'partitioned'; + relkind +--------- + p +(1 row) + +-- prevent a function referenced in partition key from being dropped +DROP FUNCTION plusone(int); +ERROR: cannot drop function plusone(integer) because other objects depend on it +DETAIL: table partitioned depends on function plusone(integer) +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+-- partitioned table cannot participate in regular inheritance +CREATE TABLE partitioned2 ( + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); +CREATE TABLE fail () INHERITS (partitioned2); +ERROR: cannot inherit from partitioned table "partitioned2" +-- Partition key in describe output +\d partitioned + Partitioned table "public.partitioned" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | + d | text | | | +Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") +Number of partitions: 0 + +\d+ partitioned2 + Partitioned table "public.partitioned2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition key: RANGE (((a + 1)), substr(b, 1, 5)) +Number of partitions: 0 + +INSERT INTO partitioned2 VALUES (1, 'hello'); +ERROR: no partition of relation "partitioned2" found for row +DETAIL: Partition key of the failing row contains ((a + 1), substr(b, 1, 5)) = (2, hello). 
+CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + Table "public.part2_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') +Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) + +DROP TABLE partitioned, partitioned2; +-- check reference to partitioned table's rowtype in partition descriptor +create table partitioned (a int, b int) + partition by list ((row(a, b)::partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'::partitioned); +create table partitioned2 + partition of partitioned for values in ('(2,4)'::partitioned); +explain (costs off) +select * from partitioned where row(a,b)::partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: (ROW(a, b)::partitioned = '(1,2)'::partitioned) +(2 rows) + +drop table partitioned; +-- whole-row Var in partition key works too +create table partitioned (a int, b int) + partition by list ((partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'); +create table partitioned2 + partition of partitioned for values in ('(2,4)'); +explain (costs off) +select * from partitioned where partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: ((partitioned.*)::partitioned = '(1,2)'::partitioned) +(2 rows) + +\d+ 
partitioned1 + Table "public.partitioned1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | +Partition of: partitioned FOR VALUES IN ('(1,2)') +Partition constraint: (((partitioned1.*)::partitioned IS DISTINCT FROM NULL) AND ((partitioned1.*)::partitioned = '(1,2)'::partitioned)) + +drop table partitioned; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (a); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- likewise for columns used in partition expressions +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
+drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- +-- Partitions +-- +-- check partition bound syntax +CREATE TABLE list_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF list_parted FOR VALUES IN ('1'); +CREATE TABLE part_p2 PARTITION OF list_parted FOR VALUES IN (2); +CREATE TABLE part_p3 PARTITION OF list_parted FOR VALUES IN ((2+1)); +CREATE TABLE part_null PARTITION OF list_parted FOR VALUES IN (null); +\d+ list_parted + Partitioned table "public.list_parted" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Partitions: part_null FOR VALUES IN (NULL), + part_p1 FOR VALUES IN (1), + part_p2 FOR VALUES IN (2), + part_p3 FOR VALUES IN (3) + +-- forbidden expressions for partition bound with list partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename.somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename.s... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(somename)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._fail PARTITION OF list_parted FOR VALUES IN (sum(somename))... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); +ERROR: aggregate functions are not allowed in partition bound +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)); +ERROR: cannot use subquery in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (generate_series(4, 6)); +ERROR: set-returning functions are not allowed in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (generate_s... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "POSIX"); +ERROR: collations are not supported by type integer +LINE 1: ...ail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "P... 
+ ^ +-- syntax does not allow empty list of values for list partitions +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); +ERROR: syntax error at or near ")" +LINE 1: ...E TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); + ^ +-- trying to specify range for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... + ^ +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODU... + ^ +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "part_default" +LINE 1: ...TE TABLE fail_default_part PARTITION OF list_parted DEFAULT; + ^ +-- specified literal can't be cast to the partition column data type +CREATE TABLE bools ( + a bool +) PARTITION BY LIST (a); +CREATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); +ERROR: specified value cannot be cast to type boolean for column "a" +LINE 1: ...REATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); + ^ +DROP TABLE bools; +-- specified literal can be cast, and the cast might not be immutable +CREATE TABLE moneyp ( + a money +) PARTITION BY LIST (a); +CREATE TABLE moneyp_10 PARTITION OF moneyp FOR VALUES IN (10); +CREATE TABLE moneyp_11 PARTITION OF moneyp FOR VALUES IN ('11'); +CREATE TABLE moneyp_12 PARTITION OF moneyp FOR VALUES IN (to_char(12, '99')::int); +DROP TABLE moneyp; +-- cast is immutable +CREATE TABLE bigintp ( + a bigint +) PARTITION BY LIST (a); +CREATE TABLE 
bigintp_10 PARTITION OF bigintp FOR VALUES IN (10); +-- fails due to overlap: +CREATE TABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); +ERROR: partition "bigintp_10_2" would overlap partition "bigintp_10" +LINE 1: ...ABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); + ^ +DROP TABLE bigintp; +CREATE TABLE range_parted ( + a date +) PARTITION BY RANGE (a); +-- forbidden expressions for partition bounds with range partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename.somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename.somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (a) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (a) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(a)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(a)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(somename)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(somename)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'); +ERROR: aggregate functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM ((select 1)) TO ('2019-01-01'); +ERROR: cannot use subquery in partition bound +LINE 2: FOR VALUES FROM ((select 1)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); +ERROR: set-returning functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); + ^ +-- trying to specify list for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); +ERROR: invalid bound specification for a range partition +LINE 1: ...BLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); + ^ +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ +-- each of start and end bounds must have same number of values as the +-- length of the partition key +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: FROM must specify exactly one value per partitioning column +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a') TO ('z', 1); +ERROR: TO must specify exactly one value per partitioning column +-- cannot specify null values in range bounds +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); +ERROR: cannot specify NULL in range bound +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... 
+ ^ +-- check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is factor of modulus of 50 but 10 is not a factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 25 is not divisible by 10, the modulus of existing partition "hpart_1". +-- previous modulus 50 is factor of 150 but this modulus is not a factor of next modulus 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 150 is not a factor of 200, the modulus of existing partition "hpart_3". +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a',... 
+ ^ +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + ^ +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; +ERROR: a hash-partitioned table may not have a default partition +-- check if compatible with the specified parent +-- cannot create as partition of a non-partitioned table +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); +ERROR: "unparted" is not partitioned +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: "unparted" is not partitioned +DROP TABLE unparted; +-- cannot create a permanent rel as partition of a temp rel +CREATE TEMP TABLE temp_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE fail_part PARTITION OF temp_parted FOR VALUES IN ('a'); +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted; +-- check for partition bound overlap and other invalid specifications +CREATE TABLE list_parted2 ( + a varchar +) PARTITION BY LIST (a); +CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); +CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); +ERROR: partition "fail_part" would overlap partition "part_null_z" +LINE 1: ...LE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); + ^ +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); +ERROR: partition "fail_part" would overlap partition "part_ab" +LINE 1: ...ail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); + ^ +-- check default 
partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE (a); +-- trying to create range partition with empty range +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (0). +-- note that the range '[1, 1)' has no elements +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (1). +CREATE TABLE part0 PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (1); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (2); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) ... + ^ +CREATE TABLE part1 PARTITION OF range_parted2 FOR VALUES FROM (1) TO (10); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1)... + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (maxvalue); +ERROR: partition "fail_part" would overlap partition "part1" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (max... 
+ ^ +CREATE TABLE part2 PARTITION OF range_parted2 FOR VALUES FROM (20) TO (30); +CREATE TABLE part3 PARTITION OF range_parted2 FOR VALUES FROM (30) TO (40); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); + ^ +-- Create a default partition for range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; +-- More than one default partition is not allowed, so this should give error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "range2_default" +LINE 1: ... TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; + ^ +-- Check if the range for default partitions overlap +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +ERROR: updated partition constraint for default partition "range2_default" would be violated by some row +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); +-- now check for multi-column range partition key +CREATE TABLE range_parted3 ( + a int, + b int +) PARTITION BY RANGE (a, (b+1)); +CREATE TABLE part00 PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, 1); +ERROR: partition "fail_part" would overlap partition "part00" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalu... 
+ ^ +CREATE TABLE part10 PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, 1); +CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10); +CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); +ERROR: partition "fail_part" would overlap partition "part12" +LINE 1: ...rt PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1,... + ^ +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; +-- cannot create a partition that says column b is allowed to range +-- from -infinity to +infinity, while there exist partitions that have +-- more specific ranges +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); +ERROR: partition "fail_part" would overlap partition "part10" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalu... + ^ +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlap with part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: partition "fail_part" would overlap partition "h2part_4" +LINE 1: ...LE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODU... 
+ ^ +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +-- check schema propagation from parent +CREATE TABLE parted ( + a text, + b int NOT NULL DEFAULT 0, + CONSTRAINT check_a CHECK (length(a) > 0) +) PARTITION BY LIST (a); +CREATE TABLE part_a PARTITION OF parted FOR VALUES IN ('a'); +-- only inherited attributes (never local ones) +SELECT attname, attislocal, attinhcount FROM pg_attribute + WHERE attrelid = 'part_a'::regclass and attnum > 0 + ORDER BY attnum; + attname | attislocal | attinhcount +---------+------------+------------- + a | f | 1 + b | f | 1 +(2 rows) + +-- able to specify column default, column constraint, and table constraint +-- first check the "column specified more than once" error +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL, + b DEFAULT 1, + b CHECK (b >= 0), + CONSTRAINT check_a CHECK (length(a) > 0) +) FOR VALUES IN ('b'); +ERROR: column "b" specified more than once +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL DEFAULT 1, + CONSTRAINT check_a CHECK (length(a) > 0), + CONSTRAINT check_b CHECK (b >= 0) +) FOR VALUES IN ('b'); +NOTICE: merging constraint "check_a" with inherited definition +-- conislocal should be false for any merged constraints, true otherwise +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass ORDER BY conislocal, coninhcount; + conislocal | coninhcount +------------+------------- + f | 1 + t | 0 +(2 rows) + +-- Once check_b is added to the parent, it should be made non-local for part_b +ALTER TABLE parted ADD CONSTRAINT check_b CHECK (b >= 0); +NOTICE: merging constraint "check_b" with inherited 
definition +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +-- Neither check_a nor check_b are droppable from part_b +ALTER TABLE part_b DROP CONSTRAINT check_a; +ERROR: cannot drop inherited constraint "check_a" of relation "part_b" +ALTER TABLE part_b DROP CONSTRAINT check_b; +ERROR: cannot drop inherited constraint "check_b" of relation "part_b" +-- And dropping it from parted should leave no trace of them on part_b, unlike +-- traditional inheritance where they will be left behind, because they would +-- be local constraints. +ALTER TABLE parted DROP CONSTRAINT check_a, DROP CONSTRAINT check_b; +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- +(0 rows) + +-- specify PARTITION BY for a partition +CREATE TABLE fail_part_col_not_found PARTITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); +ERROR: column "c" named in partition key does not exist +LINE 1: ...TITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); + ^ +CREATE TABLE part_c PARTITION OF parted (b WITH OPTIONS NOT NULL DEFAULT 0) FOR VALUES IN ('c') PARTITION BY RANGE ((b)); +-- create a level-2 partition +CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); +-- check that NOT NULL and default value are inherited correctly +create table parted_notnull_inh_test (a int default 1, b int not null default 0) partition by list (a); +create table parted_notnull_inh_test1 partition of parted_notnull_inh_test (a not null, b default 1) for values in (1); +insert into parted_notnull_inh_test (b) values (null); +ERROR: null value in column "b" of relation "parted_notnull_inh_test1" violates not-null constraint +DETAIL: Failing row contains (1, null). 
+-- note that while b's default is overriden, a's default is preserved +\d parted_notnull_inh_test1 + Table "public.parted_notnull_inh_test1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | 1 + b | integer | | not null | 1 +Partition of: parted_notnull_inh_test FOR VALUES IN (1) + +drop table parted_notnull_inh_test; +-- check that collations are assigned in partition bound expressions +create table parted_boolean_col (a bool, b text) partition by list(a); +create table parted_boolean_less partition of parted_boolean_col + for values in ('foo' < 'bar'); +create table parted_boolean_greater partition of parted_boolean_col + for values in ('foo' > 'bar'); +drop table parted_boolean_col; +-- check for a conflicting COLLATE clause +create table parted_collate_must_match (a text collate "C", b text collate "C") + partition by range (a); +-- on the partition key +create table parted_collate_must_match1 partition of parted_collate_must_match + (a collate "POSIX") for values from ('a') to ('m'); +-- on another column +create table parted_collate_must_match2 partition of parted_collate_must_match + (b collate "POSIX") for values from ('m') to ('z'); +drop table parted_collate_must_match; +-- check that non-matching collations for partition bound +-- expressions are coerced to the right collation +create table test_part_coll_posix (a text) partition by range (a collate "POSIX"); +-- ok, collation is implicitly coerced +create table test_part_coll partition of test_part_coll_posix for values from ('a' collate "C") to ('g'); +-- ok +create table test_part_coll2 partition of test_part_coll_posix for values from ('g') to ('m'); +-- ok, collation is implicitly coerced +create table test_part_coll_cast partition of test_part_coll_posix for values from (name 'm' collate "C") to ('s'); +-- ok; partition collation silently overrides the default collation of type 'name' +create table 
test_part_coll_cast2 partition of test_part_coll_posix for values from (name 's') to ('z'); +drop table test_part_coll_posix; +-- Partition bound in describe output +\d+ part_b + Table "public.part_b" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 1 | plain | | +Partition of: parted FOR VALUES IN ('b') +Partition constraint: ((a IS NOT NULL) AND (a = 'b'::text)) + +-- Both partition bound and partition key in describe output +\d+ part_c + Partitioned table "public.part_c" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: parted FOR VALUES IN ('c') +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text)) +Partition key: RANGE (b) +Partitions: part_c_1_10 FOR VALUES FROM (1) TO (10) + +-- a level-2 partition's constraint will include the parent's expressions +\d+ part_c_1_10 + Table "public.part_c_1_10" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: part_c FOR VALUES FROM (1) TO (10) +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text) AND (b IS NOT NULL) AND (b >= 1) AND (b < 10)) + +-- Show partition count in the parent's describe output +-- Tempted to include \d+ output listing partitions with bound info but +-- output could vary depending on the order in which partition oids are +-- returned. 
+\d parted + Partitioned table "public.parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | text | | | + b | integer | | not null | 0 +Partition key: LIST (a) +Number of partitions: 3 (Use \d+ to list them.) + +\d hash_parted + Partitioned table "public.hash_parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: HASH (a) +Number of partitions: 3 (Use \d+ to list them.) + +-- check that we get the expected partition constraints +CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); +\d+ unbounded_range_part + Table "public.unbounded_range_part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL)) + +DROP TABLE unbounded_range_part; +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); +\d+ range_parted4_1 + Table "public.range_parted4_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT 
NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND (abs(a) <= 1)) + +CREATE TABLE range_parted4_2 PARTITION OF range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE); +\d+ range_parted4_2 + Table "public.range_parted4_2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 3) OR ((abs(a) = 3) AND (abs(b) > 4)) OR ((abs(a) = 3) AND (abs(b) = 4) AND (c >= 5))) AND ((abs(a) < 6) OR ((abs(a) = 6) AND (abs(b) <= 7)))) + +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); +\d+ range_parted4_3 + Table "public.range_parted4_3" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 6) OR ((abs(a) = 6) AND (abs(b) >= 8))) AND (abs(a) <= 9)) + +DROP TABLE range_parted4; +-- user-defined operator class in partition key +CREATE FUNCTION my_int4_sort(int4,int4) RETURNS int LANGUAGE sql + AS $$ SELECT CASE WHEN $1 = $2 THEN 0 WHEN $1 > $2 THEN 1 ELSE -1 END; $$; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING btree AS + OPERATOR 1 < (int4,int4), OPERATOR 2 <= (int4,int4), + OPERATOR 3 = (int4,int4), OPERATOR 4 >= (int4,int4), + OPERATOR 5 > (int4,int4), FUNCTION 1 my_int4_sort(int4,int4); 
+CREATE TABLE partkey_t (a int4) PARTITION BY RANGE (a test_int4_ops); +CREATE TABLE partkey_t_1 PARTITION OF partkey_t FOR VALUES FROM (0) TO (1000); +INSERT INTO partkey_t VALUES (100); +INSERT INTO partkey_t VALUES (200); +-- cleanup +DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE partkey_t, hash_parted, hash_parted2; +DROP OPERATOR CLASS test_int4_ops USING btree; +DROP FUNCTION my_int4_sort(int4,int4); +-- comments on partitioned tables columns +CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); +COMMENT ON TABLE parted_col_comment IS 'Am partitioned table'; +COMMENT ON COLUMN parted_col_comment.a IS 'Partition key'; +SELECT obj_description('parted_col_comment'::regclass); + obj_description +---------------------- + Am partitioned table +(1 row) + +\d+ parted_col_comment + Partitioned table "public.parted_col_comment" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+--------------- + a | integer | | | | plain | | Partition key + b | text | | | | extended | | +Partition key: LIST (a) +Number of partitions: 0 + +DROP TABLE parted_col_comment; +-- list partitioning on array type column +CREATE TABLE arrlp (a int[]) PARTITION BY LIST (a); +CREATE TABLE arrlp12 PARTITION OF arrlp FOR VALUES IN ('{1}', '{2}'); +\d+ arrlp12 + Table "public.arrlp12" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-----------+-----------+----------+---------+----------+--------------+------------- + a | integer[] | | | | extended | | +Partition of: arrlp FOR VALUES IN ('{1}', '{2}') +Partition constraint: ((a IS NOT NULL) AND ((a = '{1}'::integer[]) OR (a = '{2}'::integer[]))) + +DROP TABLE arrlp; +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in 
(true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart + Partitioned table "public.boolspart" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | boolean | | | | plain | | +Partition key: LIST (a) +Partitions: boolspart_f FOR VALUES IN (false), + boolspart_t FOR VALUES IN (true) + +drop table boolspart; +-- partitions mixing temporary and permanent relations +create table perm_parted (a int) partition by list (a); +create temporary table temp_parted (a int) partition by list (a); +create table perm_part partition of temp_parted default; -- error +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +create temp table temp_part partition of perm_parted default; -- error +ERROR: cannot create a temporary relation as partition of permanent relation "perm_parted" +create temp table temp_part partition of temp_parted default; -- ok +drop table perm_parted cascade; +drop table temp_parted cascade; +-- check that adding partitions to a table while it is being used is prevented +create table tab_part_create (a int) partition by list (a); +create or replace function func_part_create() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_create_1 partition of tab_part_create for values in (1)'; + return null; + end $$; +create trigger trig_part_create before insert on tab_part_create + for each statement execute procedure func_part_create(); +insert into tab_part_create values (1); +ERROR: cannot CREATE TABLE .. 
PARTITION OF "tab_part_create" because it is being used by active queries in this session +CONTEXT: SQL statement "create table tab_part_create_1 partition of tab_part_create for values in (1)" +PL/pgSQL function func_part_create() line 3 at EXECUTE +drop table tab_part_create; +drop function func_part_create(); +-- test using a volatile expression as partition bound +create table volatile_partbound_test (partkey timestamp) partition by range (partkey); +create table volatile_partbound_test1 partition of volatile_partbound_test for values from (minvalue) to (current_timestamp); +create table volatile_partbound_test2 partition of volatile_partbound_test for values from (current_timestamp) to (maxvalue); +-- this should go into the partition volatile_partbound_test2 +insert into volatile_partbound_test values (current_timestamp); +select tableoid::regclass from volatile_partbound_test; + tableoid +-------------------------- + volatile_partbound_test2 +(1 row) + +drop table volatile_partbound_test; +-- test the case where a check constraint on default partition allows +-- to avoid scanning it when adding a new partition +create table defcheck (a int, b int) partition by list (b); +create table defcheck_def (a int, c int, b int); +alter table defcheck_def drop c; +alter table defcheck attach partition defcheck_def default; +alter table defcheck_def add check (b <= 0 and b is not null); +create table defcheck_1 partition of defcheck for values in (1, null); +-- test that complex default partition constraints are enforced correctly +insert into defcheck_def values (0, 0); +create table defcheck_0 partition of defcheck for values in (0); +ERROR: updated partition constraint for default partition "defcheck_def" would be violated by some row +drop table defcheck; +-- tests of column drop with partition tables and indexes using +-- predicates and expressions. 
+create table part_column_drop ( + useless_1 int, + id int, + useless_2 int, + d int, + b int, + useless_3 int +) partition by range (id); +alter table part_column_drop drop column useless_1; +alter table part_column_drop drop column useless_2; +alter table part_column_drop drop column useless_3; +create index part_column_drop_b_pred on part_column_drop(b) where b = 1; +create index part_column_drop_b_expr on part_column_drop((b = 1)); +create index part_column_drop_d_pred on part_column_drop(d) where d = 2; +create index part_column_drop_d_expr on part_column_drop((d = 2)); +create table part_column_drop_1_10 partition of + part_column_drop for values from (1) to (10); +\d part_column_drop + Partitioned table "public.part_column_drop" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition key: RANGE (id) +Indexes: + "part_column_drop_b_expr" btree ((b = 1)) + "part_column_drop_b_pred" btree (b) WHERE b = 1 + "part_column_drop_d_expr" btree ((d = 2)) + "part_column_drop_d_pred" btree (d) WHERE d = 2 +Number of partitions: 1 (Use \d+ to list them.) + +\d part_column_drop_1_10 + Table "public.part_column_drop_1_10" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition of: part_column_drop FOR VALUES FROM (1) TO (10) +Indexes: + "part_column_drop_1_10_b_idx" btree (b) WHERE b = 1 + "part_column_drop_1_10_d_idx" btree (d) WHERE d = 2 + "part_column_drop_1_10_expr_idx" btree ((b = 1)) + "part_column_drop_1_10_expr_idx1" btree ((d = 2)) + +drop table part_column_drop; From 31a077b2bba30ecb92f0ebae83b29d56d9b597ea Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:42:23 +0300 Subject: [PATCH 009/167] fix_pg_table_size.patch Request relation size via smgr function, not just stat(filepath). 
--- src/backend/utils/adt/dbsize.c | 38 ++++++---------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 9de2ed09d99..ade36f28be5 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -23,6 +23,7 @@ #include "commands/tablespace.h" #include "miscadmin.h" #include "storage/fd.h" +#include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/numeric.h" @@ -272,39 +273,12 @@ pg_tablespace_size_name(PG_FUNCTION_ARGS) static int64 calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum) { - int64 totalsize = 0; - char *relationpath; - char pathname[MAXPGPATH]; - unsigned int segcount = 0; - - relationpath = relpathbackend(*rfn, backend, forknum); - - for (segcount = 0;; segcount++) - { - struct stat fst; - - CHECK_FOR_INTERRUPTS(); - - if (segcount == 0) - snprintf(pathname, MAXPGPATH, "%s", - relationpath); - else - snprintf(pathname, MAXPGPATH, "%s.%u", - relationpath, segcount); - - if (stat(pathname, &fst) < 0) - { - if (errno == ENOENT) - break; - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", pathname))); - } - totalsize += fst.st_size; + SMgrRelation srel = smgropen(*rfn, backend); + if (smgrexists(srel, forknum)) { + BlockNumber n = smgrnblocks(srel, forknum); + return (int64)n*BLCKSZ; } - - return totalsize; + return 0; } Datum From 917462d58dc5b062e83597e09f3ed642025481a8 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:42:48 +0300 Subject: [PATCH 010/167] [walredo] fix_gin_redo.patch Author: Konstantin Knizhnik --- src/backend/access/gin/ginxlog.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 09ce4d6a5ba..261ab868660 100644 --- a/src/backend/access/gin/ginxlog.c +++ 
b/src/backend/access/gin/ginxlog.c @@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record) rootbuf; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; bool isRoot = (data->flags & GIN_SPLIT_ROOT) != 0; + XLogRedoAction action; /* * First clear incomplete-split flag on child page if this finishes a @@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record) if (!isLeaf) ginRedoClearIncompleteSplit(record, 3); - if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 0, &lbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of left page"); - if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 1, &rbuffer); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of right page"); if (isRoot) { - if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED) + action = XLogReadBufferForRedo(record, 2, &rootbuf); + if (action != BLK_RESTORED && action != BLK_DONE) elog(ERROR, "GIN split record did not contain a full-page image of root page"); - UnlockReleaseBuffer(rootbuf); + if (rootbuf != InvalidBuffer) + UnlockReleaseBuffer(rootbuf); } - UnlockReleaseBuffer(rbuffer); - UnlockReleaseBuffer(lbuffer); + if (rbuffer != InvalidBuffer) + UnlockReleaseBuffer(rbuffer); + if (lbuffer != InvalidBuffer) + UnlockReleaseBuffer(lbuffer); } /* From 7276b7f000c5fae040fd821021c1071e6f08a85d Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:43:15 +0300 Subject: [PATCH 011/167] [walredo] fix_brin_redo.patch Author: Konstantin Knizhnik --- src/backend/access/brin/brin_xlog.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index 39dc130e162..d63639ae950 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c 
@@ -69,7 +69,8 @@ brin_xlog_insert_update(XLogReaderState *record, } /* need this page's blkno to store in revmap */ - regpgno = BufferGetBlockNumber(buffer); + //ZENITH XXX Don't use BufferGetBlockNumber because wal-redo doesn't pin buffer. + XLogRecGetBlockTag(record, 0, NULL, NULL, ®pgno); /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) From 6d7bafbc3aead6ed9309990930527a27d4c76a14 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:43:49 +0300 Subject: [PATCH 012/167] speculative_records_workaround.patch --- src/backend/access/heap/heapam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 114a96954a6..47969b9ceac 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8882,7 +8882,7 @@ heap_xlog_insert(XLogReaderState *record) XLogRecGetBlockTag(record, 0, &target_node, NULL, &blkno); ItemPointerSetBlockNumber(&target_tid, blkno); - ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum); + ItemPointerSetOffsetNumber(&target_tid, (xlrec->flags & XLH_INSERT_IS_SPECULATIVE) ? 
SpecTokenOffsetNumber : xlrec->offnum); /* * The visibility map may need to be fixed even if the heap page is From 87b826f6258665892d44cabbf0fe594ed9bb2754 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:45:56 +0300 Subject: [PATCH 013/167] wallog_t_ctid.patch --- src/backend/access/heap/heapam.c | 24 ++++++++++++++++-------- src/include/access/heapam_xlog.h | 6 +++++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 47969b9ceac..09e0329e6f4 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2189,6 +2189,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -2507,6 +2508,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; + tuphdr->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -3011,7 +3013,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; - + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) @@ -3032,6 +3034,7 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; XLogRegisterData((char *) 
&xlhdr, SizeOfHeapHeader); @@ -3748,6 +3751,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); PageSetLSN(page, recptr); @@ -4894,6 +4898,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); /* we don't decode row locks atm, so no need to log the origin */ @@ -5943,6 +5948,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid) tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = xid; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHeapDelete); @@ -8130,7 +8136,7 @@ log_heap_update(Relation reln, Buffer oldbuf, /* Prepare WAL data for the new page */ xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); - + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); bufflags = REGBUF_STANDARD; if (init) bufflags |= REGBUF_WILL_INIT; @@ -8167,6 +8173,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr.t_infomask2 = newtup->t_data->t_infomask2; xlhdr.t_infomask = newtup->t_data->t_infomask; xlhdr.t_hoff = newtup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data); Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); /* @@ -8208,6 +8215,7 @@ log_heap_update(Relation reln, Buffer oldbuf, xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr_idx.t_infomask = 
old_key_tuple->t_data->t_infomask; xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; + xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader); @@ -8839,7 +8847,7 @@ heap_xlog_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -8940,7 +8948,7 @@ heap_xlog_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -9083,7 +9091,7 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr->t_cid); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -9223,7 +9231,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9356,7 +9364,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); 
HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9497,7 +9505,7 @@ heap_xlog_lock(XLogReaderState *record) offnum); } HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 27db48184e6..e6d31be5222 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -108,6 +108,7 @@ typedef struct xl_heap_delete { TransactionId xmax; /* xmax of the deleted tuple */ OffsetNumber offnum; /* deleted tuple's offset */ + uint32 t_cid; uint8 infobits_set; /* infomask bits */ uint8 flags; } xl_heap_delete; @@ -145,6 +146,7 @@ typedef struct xl_heap_header { uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; } xl_heap_header; @@ -186,6 +188,7 @@ typedef struct xl_multi_insert_tuple uint16 datalen; /* size of tuple data that follows */ uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_multi_insert_tuple; @@ -215,9 +218,9 @@ typedef struct xl_heap_update OffsetNumber old_offnum; /* old tuple's offset */ uint8 old_infobits_set; /* infomask bits to set on old tuple */ uint8 flags; + uint32 t_cid; TransactionId new_xmax; /* xmax of the new tuple */ OffsetNumber new_offnum; /* new tuple's offset */ - /* * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags * are set, xl_heap_header and tuple data for the old tuple follow. 
@@ -279,6 +282,7 @@ typedef struct xl_heap_lock { TransactionId locking_xid; /* might be a MultiXactId not xid */ OffsetNumber offnum; /* locked tuple's offset on page */ + uint32 t_cid; int8 infobits_set; /* infomask and infomask2 bits to set */ uint8 flags; /* XLH_LOCK_* flag bits */ } xl_heap_lock; From 9f457bf9605ef2388645f687829ebd419e433ff6 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:47:00 +0300 Subject: [PATCH 014/167] vacuumlazy_debug_stub.patch --- src/backend/access/heap/vacuumlazy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 8aab6e324e0..c684c4fbee3 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive) else if (all_visible_according_to_vm && !PageIsAllVisible(page) && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + /* ZENITH-XXX: all visible hint is not wal-logged + * FIXME: Replay visibilitymap changes in pageserver + */ + elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", vacrel->relname, blkno); visibilitymap_clear(vacrel->rel, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS); From ac7bd68afa7f619d4885ec940bc78c5bcb701907 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:48:33 +0300 Subject: [PATCH 015/167] [test] zenith_test_evict.patch --- src/backend/storage/buffer/bufmgr.c | 29 +++++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 11 +++++++++++ src/include/storage/bufmgr.h | 2 ++ 3 files changed, 42 insertions(+) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f0518f9ecc4..20df561cfa9 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ 
b/src/backend/storage/buffer/bufmgr.c @@ -157,6 +157,9 @@ int checkpoint_flush_after = 0; int bgwriter_flush_after = 0; int backend_flush_after = 0; +/* Evict unpinned pages (for better test coverage) */ +bool zenith_test_evict = false; + /* local state for StartBufferIO and related functions */ static BufferDesc *InProgressBuf = NULL; static bool IsForInput; @@ -1920,6 +1923,32 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner) UnlockBufHdr(buf, buf_state); } ForgetPrivateRefCountEntry(ref); + + if (zenith_test_evict && !InRecovery) + { + buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0) + { + if (buf_state & BM_DIRTY) + { + ReservePrivateRefCountEntry(); + PinBuffer_Locked(buf); + if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + FlushOneBuffer(BufferDescriptorGetBuffer(buf)); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } + UnpinBuffer(buf, true); + } + else + { + InvalidateBuffer(buf); + } + } + else + UnlockBufHdr(buf, buf_state); + } } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2a7f8136142..71b43a51ef4 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -87,6 +87,7 @@ #include "storage/pg_shmem.h" #include "storage/predicate.h" #include "storage/proc.h" +#include "storage/smgr.h" #include "storage/standby.h" #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" @@ -2112,6 +2113,16 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"zenith_test_evict", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Evict unpinned pages (for better test coverage)"), + }, + &zenith_test_evict, + false, + NULL, NULL, NULL + }, + + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index aa64fb42ec4..6d140786c74 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -76,6 +76,8 @@ extern int checkpoint_flush_after; extern
int backend_flush_after; extern int bgwriter_flush_after; +extern bool zenith_test_evict; + /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; From 74ae59fb4189b641b4caa5b363e3e5688b92dcd0 Mon Sep 17 00:00:00 2001 From: anastasia Date: Sun, 16 May 2021 13:53:07 +0300 Subject: [PATCH 016/167] fix_sequence_wallogging.patch --- src/backend/commands/sequence.c | 4 +- src/test/regress/expected/sequence_1.out | 824 +++++++++++++++++++++++ 2 files changed, 827 insertions(+), 1 deletion(-) create mode 100644 src/test/regress/expected/sequence_1.out diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 0415df9ccb7..9f9db3c8bcd 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -53,7 +53,9 @@ * so we pre-log a few fetches in advance. In the event of * crash we can lose (skip over) as many values as we pre-logged. */ -#define SEQ_LOG_VALS 32 +/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 /* * The "special area" of a sequence's buffer page looks like this. 
diff --git a/src/test/regress/expected/sequence_1.out b/src/test/regress/expected/sequence_1.out new file mode 100644 index 00000000000..462e3f3caa4 --- /dev/null +++ b/src/test/regress/expected/sequence_1.out @@ -0,0 +1,824 @@ +-- +-- CREATE SEQUENCE +-- +-- various error cases +CREATE UNLOGGED SEQUENCE sequence_testx; +ERROR: unlogged sequences are not supported +CREATE SEQUENCE sequence_testx INCREMENT BY 0; +ERROR: INCREMENT must not be zero +CREATE SEQUENCE sequence_testx INCREMENT BY -1 MINVALUE 20; +ERROR: MINVALUE (20) must be less than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 MAXVALUE -20; +ERROR: MINVALUE (1) must be less than MAXVALUE (-20) +CREATE SEQUENCE sequence_testx INCREMENT BY -1 START 10; +ERROR: START value (10) cannot be greater than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 START -10; +ERROR: START value (-10) cannot be less than MINVALUE (1) +CREATE SEQUENCE sequence_testx CACHE 0; +ERROR: CACHE (0) must be greater than zero +-- OWNED BY errors +CREATE SEQUENCE sequence_testx OWNED BY nobody; -- nonsense word +ERROR: invalid OWNED BY option +HINT: Specify OWNED BY table.column or OWNED BY NONE. 
+CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table +ERROR: referenced relation "pg_class_oid_index" is not a table or foreign table +CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema +ERROR: sequence must be in same schema as table it is linked to +CREATE TABLE sequence_test_table (a int); +CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b; -- wrong column +ERROR: column "b" of relation "sequence_test_table" does not exist +DROP TABLE sequence_test_table; +-- sequence data types +CREATE SEQUENCE sequence_test5 AS integer; +CREATE SEQUENCE sequence_test6 AS smallint; +CREATE SEQUENCE sequence_test7 AS bigint; +CREATE SEQUENCE sequence_test8 AS integer MAXVALUE 100000; +CREATE SEQUENCE sequence_test9 AS integer INCREMENT BY -1; +CREATE SEQUENCE sequence_test10 AS integer MINVALUE -100000 START 1; +CREATE SEQUENCE sequence_test11 AS smallint; +CREATE SEQUENCE sequence_test12 AS smallint INCREMENT -1; +CREATE SEQUENCE sequence_test13 AS smallint MINVALUE -32768; +CREATE SEQUENCE sequence_test14 AS smallint MAXVALUE 32767 INCREMENT -1; +CREATE SEQUENCE sequence_testx AS text; +ERROR: sequence type must be smallint, integer, or bigint +CREATE SEQUENCE sequence_testx AS nosuchtype; +ERROR: type "nosuchtype" does not exist +LINE 1: CREATE SEQUENCE sequence_testx AS nosuchtype; + ^ +CREATE SEQUENCE sequence_testx AS smallint MAXVALUE 100000; +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +CREATE SEQUENCE sequence_testx AS smallint MINVALUE -100000; +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test5 AS smallint; -- success, max will be adjusted +ALTER SEQUENCE sequence_test8 AS smallint; -- fail, max has to be adjusted +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test8 AS smallint MAXVALUE 20000; -- ok now +ALTER SEQUENCE sequence_test9 AS smallint; -- success, min will be 
adjusted +ALTER SEQUENCE sequence_test10 AS smallint; -- fail, min has to be adjusted +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test10 AS smallint MINVALUE -20000; -- ok now +ALTER SEQUENCE sequence_test11 AS int; -- max will be adjusted +ALTER SEQUENCE sequence_test12 AS int; -- min will be adjusted +ALTER SEQUENCE sequence_test13 AS int; -- min and max will be adjusted +ALTER SEQUENCE sequence_test14 AS int; -- min and max will be adjusted +--- +--- test creation of SERIAL column +--- +CREATE TABLE serialTest1 (f1 text, f2 serial); +INSERT INTO serialTest1 VALUES ('foo'); +INSERT INTO serialTest1 VALUES ('bar'); +INSERT INTO serialTest1 VALUES ('force', 100); +INSERT INTO serialTest1 VALUES ('wrong', NULL); +ERROR: null value in column "f2" of relation "serialtest1" violates not-null constraint +DETAIL: Failing row contains (wrong, null). +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 +(3 rows) + +SELECT pg_get_serial_sequence('serialTest1', 'f2'); + pg_get_serial_sequence +--------------------------- + public.serialtest1_f2_seq +(1 row) + +-- test smallserial / bigserial +CREATE TABLE serialTest2 (f1 text, f2 serial, f3 smallserial, f4 serial2, + f5 bigserial, f6 serial8); +INSERT INTO serialTest2 (f1) + VALUES ('test_defaults'); +INSERT INTO serialTest2 (f1, f2, f3, f4, f5, f6) + VALUES ('test_max_vals', 2147483647, 32767, 32767, 9223372036854775807, + 9223372036854775807), + ('test_min_vals', -2147483648, -32768, -32768, -9223372036854775808, + -9223372036854775808); +-- All these INSERTs should fail: +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', 32768); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', 32768); +ERROR: smallint out of range 
+INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +SELECT * FROM serialTest2 ORDER BY f2 ASC; + f1 | f2 | f3 | f4 | f5 | f6 +---------------+-------------+--------+--------+----------------------+---------------------- + test_min_vals | -2147483648 | -32768 | -32768 | -9223372036854775808 | -9223372036854775808 + test_defaults | 1 | 1 | 1 | 1 | 1 + test_max_vals | 2147483647 | 32767 | 32767 | 9223372036854775807 | 9223372036854775807 +(3 rows) + +SELECT nextval('serialTest2_f2_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f3_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f4_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f5_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f6_seq'); + nextval +--------- + 2 +(1 row) + +-- basic sequence operations using both text and oid references +CREATE SEQUENCE sequence_test; +CREATE SEQUENCE IF NOT EXISTS sequence_test; +NOTICE: relation "sequence_test" already exists, skipping +SELECT nextval('sequence_test'::text); + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::text); + currval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::regclass); + currval +--------- + 2 +(1 row) + +SELECT setval('sequence_test'::text, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 33 +(1 row) + +SELECT setval('sequence_test'::text, 99, false); + setval +-------- + 99 +(1 row) + +SELECT nextval('sequence_test'::regclass); + 
nextval +--------- + 99 +(1 row) + +SELECT setval('sequence_test'::regclass, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 33 +(1 row) + +SELECT setval('sequence_test'::regclass, 99, false); + setval +-------- + 99 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT currval('sequence_test'::regclass); +ERROR: currval of sequence "sequence_test" is not yet defined in this session +DROP SEQUENCE sequence_test; +-- renaming sequences +CREATE SEQUENCE foo_seq; +ALTER TABLE foo_seq RENAME TO foo_seq_new; +SELECT * FROM foo_seq_new; + last_value | log_cnt | is_called +------------+---------+----------- + 1 | 0 | f +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 2 +(1 row) + +-- log_cnt can be higher if there is a checkpoint just at the right +-- time, so just test for the expected range +SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; + last_value | log_cnt_ok | is_called +------------+------------+----------- + 2 | f | t +(1 row) + +DROP SEQUENCE foo_seq_new; +-- renaming serial sequences +ALTER TABLE serialtest1_f2_seq RENAME TO serialtest1_f2_foo; +INSERT INTO serialTest1 VALUES ('more'); +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 + more | 3 +(4 rows) + +-- +-- Check dependencies of serial and ordinary sequences +-- +CREATE TEMP SEQUENCE myseq2; +CREATE TEMP SEQUENCE myseq3; +CREATE TEMP TABLE t1 ( + f1 serial, + f2 int DEFAULT nextval('myseq2'), + f3 int DEFAULT nextval('myseq3'::text) +); +-- Both drops should fail, but with different error messages: +DROP SEQUENCE t1_f1_seq; +ERROR: cannot drop sequence t1_f1_seq because other objects depend on it +DETAIL: default value for column f1 of table t1 depends on sequence t1_f1_seq +HINT: Use DROP ... 
CASCADE to drop the dependent objects too. +DROP SEQUENCE myseq2; +ERROR: cannot drop sequence myseq2 because other objects depend on it +DETAIL: default value for column f2 of table t1 depends on sequence myseq2 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +-- This however will work: +DROP SEQUENCE myseq3; +DROP TABLE t1; +-- Fails because no longer existent: +DROP SEQUENCE t1_f1_seq; +ERROR: sequence "t1_f1_seq" does not exist +-- Now OK: +DROP SEQUENCE myseq2; +-- +-- Alter sequence +-- +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +NOTICE: relation "sequence_test2" does not exist, skipping +ALTER SEQUENCE serialTest1 CYCLE; -- error, not a sequence +ERROR: "serialtest1" is not a sequence +CREATE SEQUENCE sequence_test2 START WITH 32; +CREATE SEQUENCE sequence_test4 INCREMENT BY -1; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test4'); + nextval +--------- + -1 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 0; -- error +ERROR: RESTART value (0) cannot be less than MINVALUE (1) +ALTER SEQUENCE sequence_test4 RESTART WITH 40; -- error +ERROR: RESTART value (40) cannot be greater than MAXVALUE (-1) +-- test CYCLE and NO CYCLE +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + 5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT 
nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached maximum value of sequence "sequence_test2" (36) +ALTER SEQUENCE sequence_test2 RESTART WITH -24 START WITH -24 + INCREMENT BY -4 MINVALUE -36 MAXVALUE -5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + -5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH -24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached minimum value of sequence "sequence_test2" (-36) +-- reset +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 32 START WITH 32 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT setval('sequence_test2', -100); -- error +ERROR: setval: value -100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 100); -- error +ERROR: setval: value 100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 5); + setval +-------- + 5 +(1 row) + +CREATE SEQUENCE sequence_test3; -- not read from, to test is_called +-- Information schema +SELECT * FROM information_schema.sequences + WHERE sequence_name ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequence_name ASC; + sequence_catalog | 
sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option +------------------+-----------------+--------------------+-----------+-------------------+-------------------------+---------------+-------------+----------------------+---------------------+-----------+-------------- + regression | public | sequence_test10 | smallint | 16 | 2 | 0 | 1 | -20000 | 32767 | 1 | NO + regression | public | sequence_test11 | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | sequence_test12 | integer | 32 | 2 | 0 | -1 | -2147483648 | -1 | -1 | NO + regression | public | sequence_test13 | integer | 32 | 2 | 0 | -32768 | -2147483648 | 2147483647 | 1 | NO + regression | public | sequence_test14 | integer | 32 | 2 | 0 | 32767 | -2147483648 | 2147483647 | -1 | NO + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES + regression | public | sequence_test3 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test4 | bigint | 64 | 2 | 0 | -1 | -9223372036854775808 | -1 | -1 | NO + regression | public | sequence_test5 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test6 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test7 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test8 | smallint | 16 | 2 | 0 | 1 | 1 | 20000 | 1 | NO + regression | public | sequence_test9 | smallint | 16 | 2 | 0 | -1 | -32768 | -1 | -1 | NO + regression | public | serialtest1_f2_foo | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f2_seq | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f3_seq | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | serialtest2_f4_seq | smallint | 16 | 2 | 0 | 1 | 
1 | 32767 | 1 | NO + regression | public | serialtest2_f5_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | serialtest2_f6_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO +(19 rows) + +SELECT schemaname, sequencename, start_value, min_value, max_value, increment_by, cycle, cache_size, last_value +FROM pg_sequences +WHERE sequencename ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequencename ASC; + schemaname | sequencename | start_value | min_value | max_value | increment_by | cycle | cache_size | last_value +------------+--------------------+-------------+----------------------+---------------------+--------------+-------+------------+------------ + public | sequence_test10 | 1 | -20000 | 32767 | 1 | f | 1 | + public | sequence_test11 | 1 | 1 | 2147483647 | 1 | f | 1 | + public | sequence_test12 | -1 | -2147483648 | -1 | -1 | f | 1 | + public | sequence_test13 | -32768 | -2147483648 | 2147483647 | 1 | f | 1 | + public | sequence_test14 | 32767 | -2147483648 | 2147483647 | -1 | f | 1 | + public | sequence_test2 | 32 | 5 | 36 | 4 | t | 1 | 5 + public | sequence_test3 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test4 | -1 | -9223372036854775808 | -1 | -1 | f | 1 | -1 + public | sequence_test5 | 1 | 1 | 32767 | 1 | f | 1 | + public | sequence_test6 | 1 | 1 | 32767 | 1 | f | 1 | + public | sequence_test7 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test8 | 1 | 1 | 20000 | 1 | f | 1 | + public | sequence_test9 | -1 | -32768 | -1 | -1 | f | 1 | + public | serialtest1_f2_foo | 1 | 1 | 2147483647 | 1 | f | 1 | 3 + public | serialtest2_f2_seq | 1 | 1 | 2147483647 | 1 | f | 1 | 2 + public | serialtest2_f3_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f4_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f5_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 + public | serialtest2_f6_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 +(19 rows) + +SELECT 
* FROM pg_sequence_parameters('sequence_test4'::regclass); + start_value | minimum_value | maximum_value | increment | cycle_option | cache_size | data_type +-------------+----------------------+---------------+-----------+--------------+------------+----------- + -1 | -9223372036854775808 | -1 | -1 | f | 1 | 20 +(1 row) + +\d sequence_test4 + Sequence "public.sequence_test4" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +--------+-------+----------------------+---------+-----------+---------+------- + bigint | -1 | -9223372036854775808 | -1 | -1 | no | 1 + +\d serialtest2_f2_seq + Sequence "public.serialtest2_f2_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.serialtest2.f2 + +-- Test comments +COMMENT ON SEQUENCE asdf IS 'won''t work'; +ERROR: relation "asdf" does not exist +COMMENT ON SEQUENCE sequence_test2 IS 'will work'; +COMMENT ON SEQUENCE sequence_test2 IS NULL; +-- Test lastval() +CREATE SEQUENCE seq; +SELECT nextval('seq'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +SELECT setval('seq', 99); + setval +-------- + 99 +(1 row) + +SELECT lastval(); + lastval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT lastval(); +ERROR: lastval is not yet defined in this session +CREATE SEQUENCE seq2; +SELECT nextval('seq2'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +DROP SEQUENCE seq2; +-- should fail +SELECT lastval(); +ERROR: lastval is not yet defined in this session +-- Test sequences in read-only transactions +CREATE TEMPORARY SEQUENCE sequence_test_temp1; +START TRANSACTION READ ONLY; +SELECT nextval('sequence_test_temp1'); -- ok + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: cannot execute nextval() in a read-only transaction +ROLLBACK; +START 
TRANSACTION READ ONLY; +SELECT setval('sequence_test_temp1', 1); -- ok + setval +-------- + 1 +(1 row) + +SELECT setval('sequence_test2', 1); -- error +ERROR: cannot execute setval() in a read-only transaction +ROLLBACK; +-- privileges tests +CREATE USER regress_seq_user; +-- nextval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +-- currval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +-- lastval +BEGIN; +SET LOCAL SESSION AUTHORIZATION 
regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT lastval(); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +-- setval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +SAVEPOINT save; +SELECT setval('seq3', 5); +ERROR: permission denied for sequence seq3 +ROLLBACK TO save; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT setval('seq3', 5); + setval +-------- + 5 +(1 row) + +SELECT nextval('seq3'); + nextval +--------- + 6 +(1 row) + +ROLLBACK; +-- ALTER SEQUENCE +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +ALTER SEQUENCE sequence_test2 START WITH 1; +ERROR: must be owner of sequence sequence_test2 +ROLLBACK; +-- Sequences should get wiped out as well: +DROP TABLE serialTest1, serialTest2; +-- Make sure sequences are gone: +SELECT * FROM information_schema.sequences WHERE sequence_name IN + ('sequence_test2', 'serialtest2_f2_seq', 'serialtest2_f3_seq', + 'serialtest2_f4_seq', 'serialtest2_f5_seq', 'serialtest2_f6_seq') + ORDER BY sequence_name ASC; + sequence_catalog | sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option 
+------------------+-----------------+----------------+-----------+-------------------+-------------------------+---------------+-------------+---------------+---------------+-----------+-------------- + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES +(1 row) + +DROP USER regress_seq_user; +DROP SEQUENCE seq; +-- cache tests +CREATE SEQUENCE test_seq1 CACHE 10; +SELECT nextval('test_seq1'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- + 3 +(1 row) + +DROP SEQUENCE test_seq1; From e27aaeaa357d2ce7404a50c0d8d82a1ada5a3cd3 Mon Sep 17 00:00:00 2001 From: anastasia Date: Tue, 18 May 2021 12:24:06 +0300 Subject: [PATCH 017/167] Bring back change that got lost in refactoring. silence ReadBuffer_common error. TODO: add a comment, why this is fine for zenith. --- src/backend/storage/buffer/bufmgr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 20df561cfa9..1daff7125b4 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -925,11 +925,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ bufBlock = isLocalBuf ? 
LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); if (!PageIsNew((Page) bufBlock)) - ereport(ERROR, + { + // XXX-ZENITH + MemSet((char *) bufBlock, 0, BLCKSZ); + ereport(DEBUG1, (errmsg("unexpected data beyond EOF in block %u of relation %s", blockNum, relpath(smgr->smgr_rnode, forkNum)), errhint("This has been seen to occur with buggy kernels; consider updating your system."))); - + } /* * We *must* do smgrextend before succeeding, else the page will not * be reserved by the kernel, and the next P_NEW call will decide to From 6ac8b6e6f7fb8a82a0e8d6cc5e5ec8c4fb390252 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 7 Jun 2021 16:00:42 +0300 Subject: [PATCH 018/167] [contrib/zenith] [refer #225] if insert WAL position points at the end of WAL page header, then return it back to the page origin --- contrib/zenith/pagestore_smgr.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 3a91d80b926..19e39ffeb74 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -16,6 +16,7 @@ #include "access/xlog.h" #include "access/xloginsert.h" +#include "access/xlog_internal.h" #include "pagestore_client.h" #include "storage/relfilenode.h" #include "storage/smgr.h" @@ -358,6 +359,29 @@ zenith_init(void) #endif } +/* + * GetXLogInsertRecPtr uses XLogBytePosToRecPtr to convert logical insert (reserved) position + * to physical position in WAL. It always adds SizeOfXLogShortPHD: + * seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + * so even if there are no records on the page, offset will be SizeOfXLogShortPHD. + * It may cause problems with XLogFlush. So return pointer backward to the origin of the page. 
+ */ +static XLogRecPtr +zm_adjust_lsn(XLogRecPtr lsn) +{ + /* If lsn points to the beginning of first record on page or segment, + * then "return" it back to the page origin + */ + if ((lsn & (XLOG_BLCKSZ-1)) == SizeOfXLogShortPHD) + { + lsn -= SizeOfXLogShortPHD; + } + else if ((lsn & (wal_segment_size-1)) == SizeOfXLogLongPHD) + { + lsn -= SizeOfXLogLongPHD; + } + return lsn; +} /* * Return LSN for requesting pages and number of blocks from page server @@ -388,7 +412,6 @@ zenith_get_request_lsn(bool nonrel) } else { - lsn = GetLastWrittenPageLSN(); flushlsn = GetFlushRecPtr(); /* @@ -412,6 +435,8 @@ zenith_get_request_lsn(bool nonrel) elog(DEBUG1, "zenith_get_request_lsn GetFlushRecPtr lsn %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); } + else + lsn = zm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush LSN? Probably not, @@ -858,6 +883,8 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ lsn = GetXLogInsertRecPtr(); + lsn = zm_adjust_lsn(lsn); + /* * Flush it, too. We don't actually care about it here, but let's uphold * the invariant that last-written LSN <= flush LSN.
From 8360d74000c00cbc07e7b59783761ce523f81d5f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 8 Jun 2021 18:44:41 +0300 Subject: [PATCH 019/167] [walproposer] Create replication slot for walproposer to avoid loose of WAL at compute node + Check for presence of replication slot --- src/backend/replication/walproposer.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 564defc024a..d71061765b8 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -11,6 +11,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "access/xlog.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" @@ -24,6 +25,8 @@ char* wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; +#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" + static int n_walkeepers = 0; static int quorum = 0; static WalKeeper walkeeper[MAX_WALKEEPERS]; @@ -293,6 +296,13 @@ WalProposerMain(Datum main_arg) InitWalSender(); ResetWalProposerEventSet(); + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) + { + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotRelease(); + } + /* Initiate connections to all walkeeper nodes */ for (int i = 0; i < n_walkeepers; i++) { @@ -312,7 +322,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) */ startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); - cmd.slotname = NULL; + cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; StartReplication(&cmd); @@ -535,8 +545,8 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe WalReceiverConn *wrconn; WalRcvStreamOptions options; - sprintf(conninfo, 
"host=%s port=%s dbname=replication", - walkeeper[leader].host, walkeeper[leader].port); + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", + walkeeper[leader].host, walkeeper[leader].port, zenith_timeline_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { From b9e24d12e67bba52f3f0e1b01fd0b87141dc6df9 Mon Sep 17 00:00:00 2001 From: anastasia Date: Tue, 17 Aug 2021 20:12:31 +0300 Subject: [PATCH 020/167] [walproposer] Skip absent WAL segment removed by pg_resetwal --- src/backend/replication/walproposer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d71061765b8..857cef5deb7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -313,6 +313,16 @@ WalProposerMain(Datum main_arg) WalProposerPoll(); } +static bool +WalSegmentExists(XLogRecPtr startpos) +{ + char path[MAXPGPATH]; + XLogSegNo segNo; + XLByteToSeg(startpos, segNo, serverInfo.walSegSize); + XLogFilePath(path, serverInfo.timeline, segNo, serverInfo.walSegSize); + return access(path, F_OK) == 0; +} + static void WalProposerStartStreaming(XLogRecPtr startpos) { @@ -322,6 +332,14 @@ WalProposerStartStreaming(XLogRecPtr startpos) */ startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); + /* Requested segment may not exists because we generate new segment at node startup (aka pg_resetwal). + * So just skip it. 
+ */ + if (!WalSegmentExists(startpos) && WalSegmentExists(startpos + serverInfo.walSegSize)) + { + elog(LOG, "Advance start position %llx to next segment", startpos); + startpos += serverInfo.walSegSize; + } cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; From 232d87608e23a44b15a30f37cf845ec76cd6000d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 11 Jun 2021 16:39:25 +0300 Subject: [PATCH 021/167] [walproposer] Fix breaking out of WalProposerPoll and WaitEventSetWait inside. WAL proposer (as bgw without BGWORKER_BACKEND_DATABASE_CONNECTION) previously ignored SetLatch, so once caught up it stuck inside WalProposerPoll infinitely. Futher, WaitEventSetWait didn't have timeout, so we didn't try to reconnect if all connections are dead as well. Fix that. Also move break on latch set to the end of the loop to attempt ReconnectWalKeepers even if latch is constantly set. Per test_race_conditions (Python version now). --- src/backend/replication/walproposer.c | 50 +++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 857cef5deb7..410c0cab579 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -233,6 +233,7 @@ WalProposerMain(Datum main_arg) char* port; /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); @@ -535,13 +536,33 @@ StartElection(void) prop.epoch += 1; } +/* + * How much milliseconds left till we should attempt reconnection to + * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect + * (do we actually need this?). 
+ */ +static long +TimeToReconnect(TimestampTz now) +{ + TimestampTz passed; + TimestampTz till_reconnect; + + if (wal_acceptor_reconnect_timeout <= 0) + return -1; + + passed = now - last_reconnect_attempt; + till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; + if (till_reconnect <= 0) + return 0; + return (long) (till_reconnect / 1000); +} static void ReconnectWalKeepers(void) { /* Initiate reconnect if timeout is expired */ TimestampTz now = GetCurrentTimestamp(); - if (wal_acceptor_reconnect_timeout > 0 && now - last_reconnect_attempt > wal_acceptor_reconnect_timeout*1000) + if (TimeToReconnect(now) == 0) { last_reconnect_attempt = now; for (int i = 0; i < n_walkeepers; i++) @@ -633,23 +654,19 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe return true; } +/* Advance the WAL proposer state machine. */ void WalProposerPoll(void) { while (true) { WaitEvent event; - int rc = WaitEventSetWait(waitEvents, -1, &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - WalKeeper* wk = (WalKeeper*)event.user_data; + TimestampTz now = GetCurrentTimestamp(); + int rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + WalKeeper* wk = (WalKeeper*) event.user_data; int i = (int)(wk - walkeeper); - /* If wait is terminated by error, postmaster die or latch event, then exit loop */ - if (rc <= 0 || (event.events & (WL_POSTMASTER_DEATH|WL_LATCH_SET)) != 0) - { - ResetLatch(MyLatch); - break; - } - /* communication with walkeepers */ if (event.events & WL_SOCKET_READABLE) { @@ -869,7 +886,20 @@ WalProposerPoll(void) elog(FATAL, "Unexpected write state %d", wk->state); } } + ReconnectWalKeepers(); + + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. 
(no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (event.events & (WL_LATCH_SET) != 0) + { + ResetLatch(MyLatch); + break; + } + } } From 45e907432fe2752403c88d1a9521713847351863 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Jun 2021 18:03:41 +0300 Subject: [PATCH 022/167] [walproposer] Make it possible to start postgres without reading checkpoint from WAL + Check for presence of zenith.signal file to allow skip reading checkpoint record from WAL + Pass prev_record_ptr through zenith.signal file to postgres --- src/backend/access/transam/xlog.c | 50 +++++++++++++++++++++++---- src/backend/replication/walproposer.c | 18 ---------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 83b6e8c7084..c0e5593eba6 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6495,6 +6495,7 @@ StartupXLOG(void) bool reachedRecoveryTarget = false; bool haveBackupLabel = false; bool haveTblspcMap = false; + bool skipLastRecordReread = false; XLogRecPtr RecPtr, checkPointLoc, EndOfLog; @@ -7039,10 +7040,26 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; - if (RecPtr < checkPoint.redo) - ereport(PANIC, - (errmsg("invalid redo in checkpoint record"))); + { + int fd = BasicOpenFile("zenith.signal", O_RDWR | PG_BINARY); + if (fd >= 0) { + XLogRecPtr prevRecPtr = 0; + if ((size_t)read(fd, &prevRecPtr, sizeof prevRecPtr) != sizeof(prevRecPtr)) { + elog(LOG, "can't read previous record position from zenith.signal file: %m"); + } + LastRec = prevRecPtr; + /* Zenith hacks to spawn compute node without WAL */ + EndRecPtr = RecPtr = checkPoint.redo; + skipLastRecordReread = true; + close(fd); + } + else + { + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); + } + } /* * Check whether we need to force recovery from WAL. 
If it appears to @@ -7708,8 +7725,28 @@ StartupXLOG(void) * valid or last applied record, so we can identify the exact endpoint of * what we consider the valid portion of WAL. */ - XLogBeginRead(xlogreader, LastRec); - record = ReadRecord(xlogreader, PANIC, false); + + /* + * We use the last WAL page to initialize the WAL for writing, + * so we better have it in memory. + */ + if (skipLastRecordReread) + { + XLogRecPtr lastPage = EndRecPtr - (EndRecPtr % XLOG_BLCKSZ); + int idx = XLogRecPtrToBufIdx(lastPage); + XLogPageHeader xlogPageHdr = (XLogPageHeader)(XLogCtl->pages + idx*XLOG_BLCKSZ); + xlogPageHdr->xlp_pageaddr = lastPage; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + readOff = XLogSegmentOffset(lastPage, wal_segment_size); + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); + } + else + { + XLogBeginRead(xlogreader, LastRec); + record = ReadRecord(xlogreader, PANIC, false); + if (!record) + elog(PANIC, "could not re-read last record"); + } EndOfLog = EndRecPtr; /* @@ -7896,7 +7933,8 @@ StartupXLOG(void) /* Copy the valid part of the last block, and zero the rest */ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; len = EndOfLog % XLOG_BLCKSZ; - memcpy(page, xlogreader->readBuf, len); + if (!skipLastRecordReread) + memcpy(page, xlogreader->readBuf, len); memset(page + len, 0, XLOG_BLCKSZ - len); XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 410c0cab579..cbc68173ef5 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -314,16 +314,6 @@ WalProposerMain(Datum main_arg) WalProposerPoll(); } -static bool -WalSegmentExists(XLogRecPtr startpos) -{ - char path[MAXPGPATH]; - XLogSegNo segNo; - XLByteToSeg(startpos, segNo, serverInfo.walSegSize); - XLogFilePath(path, serverInfo.timeline, segNo, serverInfo.walSegSize); - return access(path, F_OK) == 0; -} - static void 
WalProposerStartStreaming(XLogRecPtr startpos) { @@ -333,14 +323,6 @@ WalProposerStartStreaming(XLogRecPtr startpos) */ startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); - /* Requested segment may not exists because we generate new segment at node startup (aka pg_resetwal). - * So just skip it. - */ - if (!WalSegmentExists(startpos) && WalSegmentExists(startpos + serverInfo.walSegSize)) - { - elog(LOG, "Advance start position %llx to next segment", startpos); - startpos += serverInfo.walSegSize; - } cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; From 508520585dfeba27ef82eed944cbc2291f30630c Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 5 Jul 2021 17:30:26 +0300 Subject: [PATCH 023/167] [walproposer] Simplify WL_LATCH_SET testing in the walproposer --- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index cbc68173ef5..e87b36287ce 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -876,7 +876,7 @@ WalProposerPoll(void) * each wal flush), then exit loop. (no need for pm death check due to * WL_EXIT_ON_PM_DEATH) */ - if (event.events & (WL_LATCH_SET) != 0) + if (event.events & WL_LATCH_SET) { ResetLatch(MyLatch); break; From 1f5c11d30908abcace4d274a3013b50cae14e48d Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 25 Jun 2021 00:12:18 +0300 Subject: [PATCH 024/167] [walredo] Add basic support for Seccomp BPF mode This patch aims to make our bespoke WAL redo machinery more robust in the presence of untrusted (in other words, possibly malicious) inputs. Pageserver delegates complex WAL decoding duties to postgres, which means that the latter might fall victim to carefully designed malicious WAL records and start doing harmful things to the system. 
To prevent this, it has been decided to limit possible interactions with the outside world using the Secure Computing BPF mode. We use this mode to disable all syscalls not in the allowlist. Please refer to src/backend/postmaster/seccomp.c to learn more about the pros & cons of the current approach. + Fix some bugs in seccomp bpf wrapper * Use SCMP_ACT_TRAP instead of SCMP_ACT_KILL_PROCESS to receive signals. * Add a missing variant of select() syscall (thx to @knizhnik). * Write error messages to an fd stderr's currently pointing to. --- configure | 86 +++++++++++ configure.ac | 13 ++ src/Makefile.global.in | 1 + src/backend/postmaster/Makefile | 5 + src/backend/postmaster/seccomp.c | 236 +++++++++++++++++++++++++++++ src/backend/tcop/zenith_wal_redo.c | 77 +++++++++- src/include/pg_config.h.in | 3 + src/include/postmaster/seccomp.h | 26 ++++ 8 files changed, 443 insertions(+), 4 deletions(-) create mode 100644 src/backend/postmaster/seccomp.c create mode 100644 src/include/postmaster/seccomp.h diff --git a/configure b/configure index 0353624a4eb..27ac5f306c5 100755 --- a/configure +++ b/configure @@ -712,6 +712,7 @@ with_libxml with_uuid with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -858,6 +859,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1564,6 +1566,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -8428,6 +8431,39 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether 
to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -14054,6 +14090,56 @@ else fi +fi + +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... " >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? 
"library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + fi # for contrib/uuid-ossp diff --git a/configure.ac b/configure.ac index 6a7bb848c4f..7084477c2e7 100644 --- a/configure.ac +++ b/configure.ac @@ -899,6 +899,14 @@ PGAC_ARG_BOOL(with, selinux, no, [build with SELinux support]) AC_SUBST(with_selinux) AC_MSG_RESULT([$with_selinux]) +# +# libseccomp +# +AC_MSG_CHECKING([whether to build with libseccomp support]) +PGAC_ARG_BOOL(with, libseccomp, no, [build with libseccomp support]) +AC_SUBST(with_libseccomp) +AC_MSG_RESULT([$with_libseccomp]) + # # Systemd # @@ -1538,6 +1546,11 @@ dnl If you want to use Apple's own Bonjour code on another platform, dnl just add -ldns_sd to LIBS manually. fi +if test "$with_libseccomp" = yes ; then + AC_CHECK_LIB(seccomp, seccomp_init, [], + [AC_MSG_ERROR([library 'libseccomp' is required for Seccomp BPF support])]) +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then AC_CHECK_HEADERS(uuid.h, diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 3b65f716cd2..3a2ae66989e 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -186,6 +186,7 @@ with_tcl = @with_tcl@ with_ssl = @with_ssl@ with_readline = @with_readline@ with_selinux = @with_selinux@ +with_libseccomp = @with_libseccomp@ with_systemd = @with_systemd@ with_gssapi = @with_gssapi@ with_krb_srvnam = @with_krb_srvnam@ diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index bfdf6a833db..926ee077111 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -26,4 +26,9 @@ OBJS = \ syslogger.o \ walwriter.o +ifeq ($(with_libseccomp),yes) +OBJS += \ + seccomp.o +endif + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c new file mode 100644 index 00000000000..4ff34ebbd66 --- /dev/null +++ b/src/backend/postmaster/seccomp.c @@ -0,0 +1,236 @@ 
+/*------------------------------------------------------------------------- + * + * seccomp.c + * Secure Computing BPF API wrapper. + * + * Pageserver delegates complex WAL decoding duties to postgres, + * which means that the latter might fall victim to carefully designed + * malicious WAL records and start doing harmful things to the system. + * To prevent this, it has been decided to limit possible interactions + * with the outside world using the Secure Computing BPF mode. + * + * We use this mode to disable all syscalls not in the allowlist. This + * approach has its pros & cons: + * + * - We have to carefully handpick and maintain the set of syscalls + * required for the WAL redo process. Core dumps help with that. + * The method of trial and error seems to work reasonably well, + * but it would be nice to find a proper way to "prove" that + * the set in question is both necessary and sufficient. + * + * - Once we enter the seccomp bpf mode, it's impossible to lift those + * restrictions (otherwise, what kind of "protection" would that be?). + * Thus, we have to either enable extra syscalls for the clean shutdown, + * or exit the process immediately via _exit() instead of proc_exit(). + * + * - Should we simply use SCMP_ACT_KILL_PROCESS, or implement a custom + * facility to deal with the forbidden syscalls? If we'd like to embed + * a startup security test, we should go with the latter; In that + * case, which one of the following options is preferable? + * + * * Catch the denied syscalls with a signal handler using SCMP_ACT_TRAP. + * Provide a common signal handler with a static switch to override + * its behavior for the test case. This would undermine the whole + * purpose of such protection, so we'd have to go further and remap + * the memory backing the switch as readonly, then ban mprotect(). + * Ugly and fragile, to say the least. + * + * * Yet again, catch the denied syscalls using SCMP_ACT_TRAP. 
+ * Provide 2 different signal handlers: one for a test case, + * another for the main processing loop. Install the first one, + * enable seccomp, perform the test, switch to the second one, + * finally ban sigaction(), presto! + * + * * Spoof the result of a syscall using SECCOMP_RET_ERRNO for the + * test, then ban it altogether with another filter. The downside + * of this solution is that we don't actually check that + * SCMP_ACT_KILL_PROCESS/SCMP_ACT_TRAP works. + * + * Either approach seems to require two eBPF filter programs, + * which is unfortunate: the man page tells this is uncommon. + * Maybe I (@funbringer) am missing something, though; I encourage + * any reader to get familiar with it and scrutinize my conclusions. + * + * TODOs and ideas in no particular order: + * + * - Do something about mmap() in musl's malloc(). + * Definitely not a priority if we don't care about musl. + * + * - See if we can untangle PG's shutdown sequence (involving unlink()): + * + * * Simplify (or rather get rid of) shmem setup in PG's WAL redo mode. + * * Investigate chroot() or mount namespaces for better FS isolation. + * * (Per Heikki) Simply call _exit(), no big deal. + * * Come up with a better idea? + * + * - Make use of seccomp's argument inspection (for what?). + * Unfortunately, it views all syscall arguments as scalars, + * so it won't work for e.g. string comparison in unlink(). + * + * - Benchmark with bpf jit on/off, try seccomp_syscall_priority(). + * + * - Test against various linux distros & glibc versions. + * I suspect that certain libc functions might involve slightly + * different syscalls, e.g. select/pselect6/pselect6_time64/whatever. + * + * - Test on any arch other than amd64 to see if it works there. 
+ * + * + * IDENTIFICATION + * src/backend/postmaster/seccomp.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "miscadmin.h" +#include "postmaster/seccomp.h" + +#include +#include + +static void die(int code, const char *str); + +static bool seccomp_test_sighandler_done = false; +static void seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt); +static void seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt); + +static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action); + +void seccomp_load_rules(PgSeccompRule *rules, int count) +{ +#define raise_error(str) \ + ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: " str))) + + struct sigaction action = { .sa_flags = SA_SIGINFO }; + PgSeccompRule rule; + long fd; + + /* + * Install a test signal handler. + * XXX: pqsignal() is too restrictive for our purposes, + * since we'd like to examine the contents of siginfo_t. + */ + action.sa_sigaction = seccomp_test_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + raise_error("failed to install a test SIGSYS handler"); + + /* + * First, check that open of a well-known file works. + * XXX: We use raw syscall() to call the very open(). 
+ */ + fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (fd < 0 || seccomp_test_sighandler_done) + raise_error("failed to open a test file"); + close((int)fd); + + /* Set a trap on open() to test seccomp bpf */ + rule = PG_SCMP(open, SCMP_ACT_TRAP); + if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) + raise_error("failed to load a test filter"); + + /* Finally, check that open() now raises SIGSYS */ + (void)syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + if (!seccomp_test_sighandler_done) + raise_error("SIGSYS handler doesn't seem to work"); + + /* Now that everything seems to work, install a proper handler */ + action.sa_sigaction = seccomp_deny_sighandler; + if (sigaction(SIGSYS, &action, NULL) != 0) + raise_error("failed to install a proper SIGSYS handler"); + + /* If this succeeds, any syscall not in the list will crash the process */ + if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) + raise_error("failed to enter seccomp mode"); + +#undef raise_error +} + +/* + * Enter seccomp mode with a BPF filter that will only allow + * certain syscalls to proceed. + */ +static int +do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) +{ + scmp_filter_ctx ctx; + int rc = -1; + + /* Create a context with a default action for syscalls not in the list */ + if ((ctx = seccomp_init(def_action)) == NULL) + goto cleanup; + + for (int i = 0; i < count; i++) + { + PgSeccompRule *rule = &rules[i]; + if ((rc = seccomp_rule_add(ctx, rule->psr_action, rule->psr_syscall, 0)) != 0) + goto cleanup; + } + + /* Try building & loading the program into the kernel */ + if ((rc = seccomp_load(ctx)) != 0) + goto cleanup; + +cleanup: + /* + * We don't need the context anymore regardless of the result, + * since either we failed or the eBPF program has already been + * loaded into the linux kernel. 
+ */ + seccomp_release(ctx); + return rc; +} + +static void +die(int code, const char *str) +{ + /* Best effort write to stderr */ + (void)write(fileno(stderr), str, strlen(str)); + + /* XXX: we don't want to run any atexit callbacks */ + _exit(code); +} + +static void +seccomp_test_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ +#define DIE_PREFIX "seccomp test signal handler: " + + /* Check that this signal handler is used only for a single test case */ + if (seccomp_test_sighandler_done) + die(1, DIE_PREFIX "test handler should only be used for 1 test\n"); + seccomp_test_sighandler_done = true; + + if (signum != SIGSYS) + die(1, DIE_PREFIX "bad signal number\n"); + + /* TODO: maybe somehow extract the hardcoded syscall number */ + if (info->si_syscall != SCMP_SYS(open)) + die(1, DIE_PREFIX "bad syscall number\n"); + +#undef DIE_PREFIX +} + +static void +seccomp_deny_sighandler(int signum, siginfo_t *info, void *cxt pg_attribute_unused()) +{ + /* + * Unfortunately, we can't use seccomp_syscall_resolve_num_arch() + * to resolve the syscall's name, since it calls strdup() + * under the hood (wtf!). + */ + char buffer[128]; + (void)snprintf(buffer, lengthof(buffer), + "---------------------------------------\n" + "seccomp: bad syscall %d\n" + "---------------------------------------\n", + info->si_syscall); + + /* + * Instead of silently crashing the process with + * a fake SIGSYS caused by SCMP_ACT_KILL_PROCESS, + * we'd like to receive a real SIGSYS to print the + * message and *then* immediately exit. 
+ */ + die(1, buffer); +} diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 4503648fc3e..7e00a9e985d 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -62,6 +62,11 @@ #include #endif +#if defined(HAVE_LIBSECCOMP) && defined(__GLIBC__) +#define MALLOC_NO_MMAP +#include +#endif + #ifndef HAVE_GETRUSAGE #include "rusagestub.h" #endif @@ -73,9 +78,10 @@ #include "libpq/pqformat.h" #include "miscadmin.h" #include "postmaster/postmaster.h" -#include "storage/ipc.h" -#include "storage/bufmgr.h" +#include "postmaster/seccomp.h" #include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" #include "storage/proc.h" #include "storage/smgr.h" #include "tcop/tcopprot.h" @@ -93,6 +99,44 @@ static BufferTag target_redo_tag; #define TRACE DEBUG5 +#ifdef HAVE_LIBSECCOMP +static void +enter_seccomp_mode(void) +{ + PgSeccompRule syscalls[] = + { + /* Hard requirements */ + PG_SCMP_ALLOW(exit_group), + PG_SCMP_ALLOW(pselect6), + PG_SCMP_ALLOW(read), + PG_SCMP_ALLOW(select), + PG_SCMP_ALLOW(write), + + /* Memory allocation */ + PG_SCMP_ALLOW(brk), +#ifndef MALLOC_NO_MMAP + /* TODO: musl doesn't have mallopt */ + PG_SCMP_ALLOW(mmap), + PG_SCMP_ALLOW(munmap), +#endif + + /* Enable those for a proper shutdown. 
+ PG_SCMP_ALLOW(munmap), + PG_SCMP_ALLOW(shmctl), + PG_SCMP_ALLOW(shmdt), + PG_SCMP_ALLOW(unlink), // shm_unlink + */ + }; + +#ifdef MALLOC_NO_MMAP + /* Ask glibc not to use mmap() */ + mallopt(M_MMAP_MAX, 0); +#endif + + seccomp_load_rules(syscalls, lengthof(syscalls)); +} +#endif + /* ---------------------------------------------------------------- * FIXME comment * PostgresMain @@ -245,6 +289,22 @@ WalRedoMain(int argc, char *argv[], RmgrTable[rmid].rm_startup(); } +#ifdef HAVE_LIBSECCOMP + /* We prefer opt-out to opt-in for greater security */ + bool enable_seccomp = true; + for (int i = 1; i < argc; i++) + if (strcmp(argv[i], "--disable-seccomp") == 0) + enable_seccomp = false; + + /* + * We deliberately delay the transition to the seccomp mode + * until it's time to enter the main processing loop; + * else we'd have to add a lot more syscalls to the allowlist. + */ + if (enable_seccomp) + enter_seccomp_mode(); +#endif + /* * Main processing loop */ @@ -289,6 +349,16 @@ WalRedoMain(int argc, char *argv[], */ case EOF: +#ifdef HAVE_LIBSECCOMP + /* + * Skip the shutdown sequence, leaving some garbage behind. + * Hopefully, postgres will clean it up in the next run. + * This way we don't have to enable extra syscalls, which is nice. + * See enter_seccomp_mode() above. + */ + if (enable_seccomp) + _exit(0); +#endif /* * NOTE: if you are tempted to add more code here, DON'T! 
* Whatever you had in mind to do should be set up as an @@ -636,8 +706,7 @@ GetPage(StringInfo input_message) /* single thread, so don't bother locking the page */ /* Response: Page content */ - fwrite(page, 1, BLCKSZ, stdout); /* FIXME: check errors */ - fflush(stdout); + write(STDOUT_FILENO, page, BLCKSZ); /* FIXME: check errors */ ReleaseBuffer(buf); DropDatabaseBuffers(rnode.dbNode); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d69d461ff2c..d35fc9c94e4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -342,6 +342,9 @@ /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE +/* Define to 1 if you have the `seccomp' library (-lseccomp). */ +#undef HAVE_LIBSECCOMP + /* Define to 1 if you have the `selinux' library (-lselinux). */ #undef HAVE_LIBSELINUX diff --git a/src/include/postmaster/seccomp.h b/src/include/postmaster/seccomp.h new file mode 100644 index 00000000000..1613d34bd47 --- /dev/null +++ b/src/include/postmaster/seccomp.h @@ -0,0 +1,26 @@ +#ifndef PG_SECCOMP_H +#define PG_SECCOMP_H + +#include "postgres.h" + +#ifdef HAVE_LIBSECCOMP +#include +#endif + +typedef struct { + int psr_syscall; /* syscall number */ + uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */ +} PgSeccompRule; + +#define PG_SCMP(syscall, action) \ + (PgSeccompRule) { \ + .psr_syscall = SCMP_SYS(syscall), \ + .psr_action = (action), \ + } + +#define PG_SCMP_ALLOW(syscall) \ + PG_SCMP(syscall, SCMP_ACT_ALLOW) + +void seccomp_load_rules(PgSeccompRule *syscalls, int count); + +#endif /* PG_SECCOMP_H */ From fb8c2a1a8cfd9dc2d0269e00cdac6436942c377b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 9 Jul 2021 11:56:20 +0300 Subject: [PATCH 025/167] [smgr_api] [contrib/zenith] 1. Do not call mdinit from smgrinit() because it cause memory leak in wal-redo-postgres 2. 
Add check for local relations to make it possible to use DEBUG_COMPARE_LOCAL mode in SMGR + Call smgr_init_standard from smgr_init_zenith --- contrib/zenith/pagestore_smgr.c | 29 ++++++++++++++++++++--------- src/backend/storage/smgr/smgr.c | 7 ++----- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 19e39ffeb74..29c5c46c0e3 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -44,6 +44,8 @@ static char *hexdump_page(char *page); #endif +#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) + const int SmgrTrace = DEBUG5; bool loaded = false; @@ -492,7 +494,8 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) forkNum); #ifdef DEBUG_COMPARE_LOCAL - mdcreate(reln, forkNum, isRedo); + if (IS_LOCAL_REL(reln)) + mdcreate(reln, forkNum, isRedo); #endif } @@ -548,7 +551,8 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, (uint32) (lsn >> 32), (uint32) lsn); #ifdef DEBUG_COMPARE_LOCAL - mdextend(reln, forkNum, blkno, buffer, skipFsync); + if (IS_LOCAL_REL(reln)) + mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif } @@ -562,7 +566,8 @@ zenith_open(SMgrRelation reln) elog(SmgrTrace, "[ZENITH_SMGR] open noop"); #ifdef DEBUG_COMPARE_LOCAL - mdopen(reln); + if (IS_LOCAL_REL(reln)) + mdopen(reln); #endif } @@ -576,7 +581,8 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) elog(SmgrTrace, "[ZENITH_SMGR] close noop"); #ifdef DEBUG_COMPARE_LOCAL - mdclose(reln, forknum); + if (IS_LOCAL_REL(reln)) + mdclose(reln, forknum); #endif } @@ -605,7 +611,8 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); #ifdef DEBUG_COMPARE_LOCAL - mdwriteback(reln, forknum, blocknum, nblocks); + if (IS_LOCAL_REL(reln)) + mdwriteback(reln, forknum, blocknum, nblocks); #endif } @@ -647,7 +654,7 @@ 
zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, #ifdef DEBUG_COMPARE_LOCAL - if (forkNum == MAIN_FORKNUM) + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; char mdbuf[BLCKSZ]; @@ -828,7 +835,8 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, (uint32) (lsn >> 32), (uint32) lsn); #ifdef DEBUG_COMPARE_LOCAL - mdwrite(reln, forknum, blocknum, buffer, skipFsync); + if (IS_LOCAL_REL(reln)) + mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif } @@ -894,7 +902,8 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) SetLastWrittenPageLSN(lsn); #ifdef DEBUG_COMPARE_LOCAL - mdtruncate(reln, forknum, nblocks); + if (IS_LOCAL_REL(reln)) + mdtruncate(reln, forknum, nblocks); #endif } @@ -915,7 +924,8 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL - mdimmedsync(reln, forknum); + if (IS_LOCAL_REL(reln)) + mdimmedsync(reln, forknum); #endif } @@ -953,5 +963,6 @@ smgr_zenith(BackendId backend, RelFileNode rnode) void smgr_init_zenith(void) { + smgr_init_standard(); zenith_init(); } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index b455d07edce..8d2b6b73b29 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -69,10 +69,7 @@ static dlist_head unowned_relns; void smgrinit(void) { - if (smgr_init_hook) - (*smgr_init_hook)(); - - smgr_init_standard(); + (*smgr_init_hook)(); /* * ZENITH XXX @@ -98,7 +95,7 @@ smgrinit(void) /* Hook for plugins to get control in smgr */ smgr_hook_type smgr_hook = NULL; -smgr_init_hook_type smgr_init_hook = NULL; +smgr_init_hook_type smgr_init_hook = smgr_init_standard; smgr_shutdown_hook_type smgr_shutdown_hook = NULL; const f_smgr * From 15becb666ef646dcd59245ef012a487f046f6468 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 15 Jul 2021 15:50:44 +0300 Subject: [PATCH 
026/167] [walproposer] [contrib/zenith] support zenith_tenant this patch adds support for zenith_tenant variable. it has similar format as zenith_timeline. It is used in callmemaybe query to pass tenant to pageserver and in ServerInfo structure passed to wal acceptor --- contrib/zenith/libpagestore.c | 26 +++++++++++++++++++------- contrib/zenith/pagestore_client.h | 1 + contrib/zenith/pagestore_smgr.c | 1 + src/backend/replication/walproposer.c | 8 ++++++++ src/include/replication/walproposer.h | 2 ++ 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 062f0cbf2e0..89bfbe1906c 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -67,11 +67,13 @@ zenith_connect() } /* Ask the Page Server to connect to us, and stream WAL from us. */ - if (callmemaybe_connstring && callmemaybe_connstring[0]) + if (callmemaybe_connstring && callmemaybe_connstring[0] + && zenith_tenant + && zenith_timeline) { PGresult *res; - query = psprintf("callmemaybe %s %s", zenith_timeline, callmemaybe_connstring); + query = psprintf("callmemaybe %s %s %s", zenith_tenant, zenith_timeline, callmemaybe_connstring); res = PQexec(pageserver_conn, query); if (PQresultStatus(res) != PGRES_COMMAND_OK) { @@ -81,7 +83,7 @@ zenith_connect() PQclear(res); } - query = psprintf("pagestream %s", zenith_timeline); + query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) zenith_log(ERROR, @@ -185,11 +187,11 @@ zenith_call(ZenithRequest request) static bool -check_zenith_timeline(char **newval, void **extra, GucSource source) +check_zenith_id(char **newval, void **extra, GucSource source) { - uint8 ztimelineid[16]; + uint8 zid[16]; - return **newval == '\0' || HexDecodeString(ztimelineid, *newval, 16); + return **newval == '\0' || HexDecodeString(zid, *newval, 16); } /* @@ -223,7 +225,16 @@ _PG_init(void) "", PGC_POSTMASTER, 0, /* no 
flags required */ - check_zenith_timeline, NULL, NULL); + check_zenith_id, NULL, NULL); + + DefineCustomStringVariable("zenith.zenith_tenant", + "Zenith tenantid the server is running on", + NULL, + &zenith_tenant, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_zenith_id, NULL, NULL); DefineCustomBoolVariable("zenith.wal_redo", "start in wal-redo mode", @@ -242,6 +253,7 @@ _PG_init(void) /* Is there more correct way to pass CustomGUC to postgres code? */ zenith_timeline_walproposer = zenith_timeline; + zenith_tenant_walproposer = zenith_tenant; if (wal_redo) { diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 400fb259a6b..b4b223d3c46 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -90,6 +90,7 @@ extern page_server_api * page_server; extern char *page_server_connstring; extern char *callmemaybe_connstring; extern char *zenith_timeline; +extern char *zenith_tenant; extern bool wal_redo; extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 29c5c46c0e3..858a67841ea 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -56,6 +56,7 @@ page_server_api *page_server; char *page_server_connstring; char *callmemaybe_connstring; char *zenith_timeline; +char *zenith_tenant; bool wal_redo = false; char const *const ZenithMessageStr[] = diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index e87b36287ce..7a7996a1f82 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -221,6 +221,7 @@ HandleWalKeeperResponse(void) } char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; /* * WAL proposer bgworeker entry point @@ -285,6 +286,13 @@ WalProposerMain(Datum main_arg) if (*zenith_timeline_walproposer != '\0' && 
!HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + + if (!zenith_tenant_walproposer) + elog(FATAL, "zenith.zenith_tenant is not provided"); + if (*zenith_tenant_walproposer != '\0' && + !HexDecodeString(serverInfo.ztenantid, zenith_tenant_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + serverInfo.protocolVersion = SK_PROTOCOL_VERSION; pg_strong_random(&serverInfo.nodeId.uuid, sizeof(serverInfo.nodeId.uuid)); serverInfo.systemId = GetSystemIdentifier(); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index e1845e3fb19..d770473ad35 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -23,6 +23,7 @@ struct WalMessage; typedef struct WalMessage WalMessage; extern char *zenith_timeline_walproposer; +extern char *zenith_tenant_walproposer; /* WAL safekeeper state */ typedef enum @@ -59,6 +60,7 @@ typedef struct ServerInfo XLogRecPtr walEnd; TimeLineID timeline; int walSegSize; + uint8 ztenantid[16]; } ServerInfo; /* From 87797bb6bdea98f9e677b68a2c13c1c1e2d203d9 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 4 Aug 2021 11:56:41 +0300 Subject: [PATCH 027/167] [walproposer] Remove graceful termination of COPY during walproposer recovery. Rust's postgres_backend currently is too dummy to handle it properly: reading happens in separate thread which just ignores CopyDone. Instead, writer thread must get aware of termination and send CommandComplete. Also reading socket must be transferred back to postgres_backend (or connection terminated completely after COPY). Let's do that after more basic safkeeper refactoring and right now cover this up to make tests pass. 
ref #388 --- src/backend/replication/walproposer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7a7996a1f82..240e3769a6a 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -612,7 +612,6 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe if (rec_end_lsn >= endpos) break; } - walrcv_endstreaming(wrconn, &timeline); walrcv_disconnect(wrconn); } else From 4319a6f84724eed507cf8408fa8f195c34363ef3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 9 Aug 2021 09:46:07 +0300 Subject: [PATCH 028/167] [walproposer] [contrib/zenith] [refer #395] Do no align sart replication position in wal_proppser to segment boundary --- contrib/zenith/pagestore_smgr.c | 4 + src/backend/replication/walproposer.c | 359 +++++++++++++------------- 2 files changed, 181 insertions(+), 182 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 858a67841ea..3d24cb79f5f 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -21,6 +21,7 @@ #include "storage/relfilenode.h" #include "storage/smgr.h" #include "access/xlogdefs.h" +#include "postmaster/interrupt.h" #include "storage/bufmgr.h" #include "fmgr.h" #include "miscadmin.h" @@ -243,6 +244,9 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { XLogRecPtr lsn = PageGetLSN(buffer); + if (ShutdownRequestPending) + return; + /* * If the page was not WAL-logged before eviction then we can lose its modification. * PD_WAL_LOGGED bit is used to mark pages which are wal-logged. 
diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 240e3769a6a..837b751a24c 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -326,11 +326,6 @@ static void WalProposerStartStreaming(XLogRecPtr startpos) { StartReplicationCmd cmd; - /* - * Always start streaming at the beginning of a segment - */ - startpos -= XLogSegmentOffset(startpos, serverInfo.walSegSize); - cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; @@ -656,226 +651,227 @@ WalProposerPoll(void) WalKeeper* wk = (WalKeeper*) event.user_data; int i = (int)(wk - walkeeper); - /* communication with walkeepers */ - if (event.events & WL_SOCKET_READABLE) + if (rc != 0) { - switch (wk->state) + /* communication with walkeepers */ + if (event.events & WL_SOCKET_READABLE) { - case SS_HANDSHAKE: - /* Receive walkeeper node state */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info + wk->asyncOffs, - sizeof(wk->info) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info)) - { - /* WalKeeper response completely received */ - - /* Check protocol version */ - if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) + switch (wk->state) + { + case SS_HANDSHAKE: + /* Receive walkeeper node state */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info + wk->asyncOffs, + sizeof(wk->info) - wk->asyncOffs); + if (rc < 0) { - elog(WARNING, "WalKeeper has incompatible protocol version %d vs. 
%d", - wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); ResetConnection(i); } - else + else if ((wk->asyncOffs += rc) == sizeof(wk->info)) { - wk->state = SS_VOTING; - wk->feedback.flushLsn = restartLsn; - wk->feedback.hs.ts = 0; + /* WalKeeper response completely received */ - /* Check if we have quorum */ - if (++n_connected >= quorum) + /* Check protocol version */ + if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) { - if (n_connected == quorum) - StartElection(); + elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", + wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); + ResetConnection(i); + } + else + { + wk->state = SS_VOTING; + wk->feedback.flushLsn = restartLsn; + wk->feedback.hs.ts = 0; - /* Now send max-node-id to everyone participating in voting and wait their responses */ - for (int j = 0; j < n_walkeepers; j++) + /* Check if we have quorum */ + if (++n_connected >= quorum) { - if (walkeeper[j].state == SS_VOTING) + if (n_connected == quorum) + StartElection(); + + /* Now send max-node-id to everyone participating in voting and wait their responses */ + for (int j = 0; j < n_walkeepers; j++) { - if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) - { - ResetConnection(j); - } - else + if (walkeeper[j].state == SS_VOTING) { - walkeeper[j].asyncOffs = 0; - walkeeper[j].state = SS_WAIT_VERDICT; + if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) + { + ResetConnection(j); + } + else + { + walkeeper[j].asyncOffs = 0; + walkeeper[j].state = SS_WAIT_VERDICT; + } } } } } } - } - break; - - case SS_WAIT_VERDICT: - /* Receive walkeeper response for our candidate */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info.server.nodeId + wk->asyncOffs, - sizeof(wk->info.server.nodeId) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) - { - /* Response completely received */ - - /* If server accept our candidate, then it returns it in 
response */ - if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + break; + + case SS_WAIT_VERDICT: + /* Receive walkeeper response for our candidate */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->info.server.nodeId + wk->asyncOffs, + sizeof(wk->info.server.nodeId) - wk->asyncOffs); + if (rc < 0) { - elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->info.server.nodeId.term, prop.nodeId.term); + ResetConnection(i); } - else + else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) { - /* Handshake completed, do we have quorum? */ - wk->state = SS_IDLE; - if (++n_votes == quorum) - { - elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", - quorum, - (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) - ); + /* Response completely received */ - /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn != prop.VCL) - { - /* Perform recovery */ - if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) - elog(FATAL, "Failed to recover state"); - } - WalProposerStartStreaming(prop.VCL); - /* Should not return here */ + /* If server accept our candidate, then it returns it in response */ + if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + { + elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->info.server.nodeId.term, prop.nodeId.term); } else { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ - SendMessageToNode(i, msgQueueHead); + /* Handshake completed, do we have quorum? 
*/ + wk->state = SS_IDLE; + if (++n_votes == quorum) + { + elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", + quorum, + (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) + ); + + /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + if (restartLsn != prop.VCL) + { + /* Perform recovery */ + if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + elog(FATAL, "Failed to recover state"); + } + WalProposerStartStreaming(prop.VCL); + /* Should not return here */ + } + else + { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } } } - } - break; - - case SS_RECV_FEEDBACK: - /* Read walkeeper response with flushed WAL position */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->feedback + wk->asyncOffs, - sizeof(wk->feedback) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) - { - WalMessage* next = wk->currMsg->next; - Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ - wk->state = SS_IDLE; - wk->asyncOffs = 0; - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); - - /* - * Also send the new VCL to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other outbound messages - * pending. 
- */ - if (true) + break; + + case SS_RECV_FEEDBACK: + /* Read walkeeper response with flushed WAL position */ + rc = ReadSocketAsync(wk->sock, + (char*)&wk->feedback + wk->asyncOffs, + sizeof(wk->feedback) - wk->asyncOffs); + if (rc < 0) { - XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - WalMessage *vclUpdateMsg; - - if (minQuorumLsn > lastSentVCLLsn) + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) + { + WalMessage* next = wk->currMsg->next; + Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + wk->state = SS_IDLE; + wk->asyncOffs = 0; + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); + + /* + * Also send the new VCL to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other outbound messages + * pending. + */ + if (true) { - vclUpdateMsg = CreateMessageVCLOnly(); - if (vclUpdateMsg) - BroadcastMessage(vclUpdateMsg); - lastSentVCLLsn = minQuorumLsn; + XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + WalMessage *vclUpdateMsg; + + if (minQuorumLsn > lastSentVCLLsn) + { + vclUpdateMsg = CreateMessageVCLOnly(); + if (vclUpdateMsg) + BroadcastMessage(vclUpdateMsg); + lastSentVCLLsn = minQuorumLsn; + } } } - } - break; - - case SS_IDLE: - elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port); - ResetConnection(i); - break; + break; + case SS_IDLE: + elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port); + ResetConnection(i); + break; - default: - elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state); + default: + elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state); + } } - } - else if (event.events & WL_SOCKET_WRITEABLE) - { - switch (wk->state) + else if (event.events & WL_SOCKET_WRITEABLE) { - case SS_CONNECTING: + switch (wk->state) { - int 
optval = 0; - ACCEPT_TYPE_ARG3 optlen = sizeof(optval); - if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) + case SS_CONNECTING: { - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, - strerror(optval)); - closesocket(wk->sock); - wk->sock = PGINVALID_SOCKET; - wk->state = SS_OFFLINE; - ResetWalProposerEventSet(); - } - else - { - uint32 len = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - /* - * Start handshake: send information about server. - * First of all send 0 as package size: it allows walkeeper to distinguish - * wal_proposer's connection from standard replication connection from pagers. - */ - if (WriteSocket(wk->sock, &len, sizeof len) - && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) + int optval = 0; + ACCEPT_TYPE_ARG3 optlen = sizeof(optval); + if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) { - wk->state = SS_HANDSHAKE; - wk->asyncOffs = 0; + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, + strerror(optval)); + closesocket(wk->sock); + wk->sock = PGINVALID_SOCKET; + wk->state = SS_OFFLINE; + ResetWalProposerEventSet(); } else { - ResetConnection(i); + uint32 len = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + /* + * Start handshake: send information about server. + * First of all send 0 as package size: it allows walkeeper to distinguish + * wal_proposer's connection from standard replication connection from pagers. 
+ */ + if (WriteSocket(wk->sock, &len, sizeof len) + && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) + { + wk->state = SS_HANDSHAKE; + wk->asyncOffs = 0; + } + else + { + ResetConnection(i); + } } + break; } - break; - } - case SS_SEND_WAL: - rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == wk->currMsg->size) - { - /* WAL block completely sent */ - wk->state = SS_RECV_FEEDBACK; - wk->asyncOffs = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - } - break; + case SS_SEND_WAL: + rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); + if (rc < 0) + { + ResetConnection(i); + } + else if ((wk->asyncOffs += rc) == wk->currMsg->size) + { + /* WAL block completely sent */ + wk->state = SS_RECV_FEEDBACK; + wk->asyncOffs = 0; + ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); + } + break; - default: - elog(FATAL, "Unexpected write state %d", wk->state); + default: + elog(FATAL, "Unexpected write state %d", wk->state); + } } } - ReconnectWalKeepers(); /* @@ -883,12 +879,11 @@ WalProposerPoll(void) * each wal flush), then exit loop. (no need for pm death check due to * WL_EXIT_ON_PM_DEATH) */ - if (event.events & WL_LATCH_SET) + if (rc != 0 && (event.events & WL_LATCH_SET)) { ResetLatch(MyLatch); break; } - } } From 48fc26e1a9214708c787cbd021ddb94eccdb5aa2 Mon Sep 17 00:00:00 2001 From: anastasia Date: Tue, 10 Aug 2021 08:10:08 +0300 Subject: [PATCH 029/167] [test] Add contrib/zenith_test_utils with helpers for testing and debugging. Now it contains only one function test_consume_xids() for xid wraparound testing. 
--- contrib/zenith_test_utils/Makefile | 22 ++++++++ .../zenith_test_utils--1.0.sql | 8 +++ .../zenith_test_utils.control | 5 ++ contrib/zenith_test_utils/zenithtest.c | 50 +++++++++++++++++++ 4 files changed, 85 insertions(+) create mode 100644 contrib/zenith_test_utils/Makefile create mode 100644 contrib/zenith_test_utils/zenith_test_utils--1.0.sql create mode 100644 contrib/zenith_test_utils/zenith_test_utils.control create mode 100644 contrib/zenith_test_utils/zenithtest.c diff --git a/contrib/zenith_test_utils/Makefile b/contrib/zenith_test_utils/Makefile new file mode 100644 index 00000000000..9203f2349d3 --- /dev/null +++ b/contrib/zenith_test_utils/Makefile @@ -0,0 +1,22 @@ +# contrib/zenith_test_utils/Makefile + + +MODULE_big = zenith_test_utils +OBJS = \ + $(WIN32RES) \ + zenithtest.o + +EXTENSION = zenith_test_utils +DATA = zenith_test_utils--1.0.sql +PGFILEDESC = "zenith_test_utils - helpers for zenith testing and debugging" + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/zenith_test_utils +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql new file mode 100644 index 00000000000..6c8fe6521cf --- /dev/null +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -0,0 +1,8 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION zenith_test_utils" to load this file. 
\quit + +CREATE FUNCTION test_consume_xids(nxids int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_xids' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenith_test_utils.control b/contrib/zenith_test_utils/zenith_test_utils.control new file mode 100644 index 00000000000..9b947b63966 --- /dev/null +++ b/contrib/zenith_test_utils/zenith_test_utils.control @@ -0,0 +1,5 @@ +# zenith_test_utils extension +comment = 'helpers for zenith testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/zenith_test_utils' +relocatable = true diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c new file mode 100644 index 00000000000..a7eb278a09b --- /dev/null +++ b/contrib/zenith_test_utils/zenithtest.c @@ -0,0 +1,50 @@ +/*------------------------------------------------------------------------- + * + * zenithtest.c + * Helpers for zenith testing and debugging + * + * IDENTIFICATION + * contrib/zenith_test_utils/zenithtest.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" + + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(test_consume_xids); + +/* + * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. 
+ */ +Datum +test_consume_xids(PG_FUNCTION_ARGS) +{ + int32 nxids = PG_GETARG_INT32(0); + TransactionId topxid; + FullTransactionId fullxid; + TransactionId xid; + TransactionId targetxid; + + /* make sure we have a top-XID first */ + topxid = GetTopTransactionId(); + + xid = ReadNextTransactionId(); + + targetxid = xid + nxids; + while (targetxid < FirstNormalTransactionId) + targetxid++; + + while (TransactionIdPrecedes(xid, targetxid)) + { + fullxid = GetNewTransactionId(true); + xid = XidFromFullTransactionId(fullxid); + elog(DEBUG1, "topxid: %u xid: %u", topxid, xid); + } + + PG_RETURN_VOID(); +} From 9dedb55d21795f2d505f23503ad669504f2fdeb2 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 11 Aug 2021 08:49:54 +0300 Subject: [PATCH 030/167] [walproposer] Change condition for triggering recovery --- src/backend/replication/walproposer.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 837b751a24c..a03ca9952a0 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -212,7 +212,10 @@ HandleWalKeeperResponse(void) WalMessage* msg = msgQueueHead; msgQueueHead = msg->next; if (restartLsn < msg->req.beginLsn) + { + Assert(restartLsn < msg->req.endLsn); restartLsn = msg->req.endLsn; + } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(WalKeeperRequest)); free(msg); } @@ -326,6 +329,8 @@ static void WalProposerStartStreaming(XLogRecPtr startpos) { StartReplicationCmd cmd; + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = serverInfo.timeline; cmd.startpoint = startpos; @@ -357,9 +362,11 @@ SendMessageToNode(int i, WalMessage* msg) msg->req.restartLsn = restartLsn; msg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - elog(LOG, "sending message with len %ld VCL=%X/%X to %d", - msg->size - 
sizeof(WalKeeperRequest), - (uint32) (msg->req.commitLsn >> 32), (uint32) msg->req.commitLsn, i); + elog(LOG, "sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %d", + msg->size - sizeof(WalKeeperRequest), + LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(restartLsn), + i); rc = WriteSocketAsync(walkeeper[i].sock, &msg->req, msg->size); if (rc < 0) @@ -743,8 +750,10 @@ WalProposerPoll(void) ); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn != prop.VCL) + if (restartLsn < prop.VCL) { + elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", + LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); /* Perform recovery */ if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) elog(FATAL, "Failed to recover state"); From eae6cb8fce7d15f8a36765c089e266e75867a0fb Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 5 Aug 2021 02:14:55 +0300 Subject: [PATCH 031/167] [contrib/zenith] Use authentication token passed as environment variable in connections to pageserver. Token is passed as cleartext password. --- contrib/zenith/libpagestore.c | 87 +++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 89bfbe1906c..142999a6a8e 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -50,10 +50,91 @@ page_server_api api = { static void zenith_connect() { - char *query; - int ret; + char *query; + int ret; + char *auth_token; + char *err = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + int noptions = 0; + + // this is heavily inspired by psql/command.c::do_connect + conn_options = PQconninfoParse( + page_server_connstring, + &err + ); + + if (conn_options == NULL) { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? 
pstrdup(err) : "out of memory"; + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + // Trying to populate pageserver connection string with auth token from environment. + // We are looking for password in with placeholder value like $ENV_VAR_NAME, so if password field is present + // and starts with $ we try to fetch environment variable value and fail loudly if it is not set + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + noptions++; + if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + // ensure that this is a template + if (strncmp(conn_option->val, "$", 1) != 0) { + ereport( + ERROR, + ( + errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]) + ) + ); + } + + zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) { + ereport( + ERROR, + ( + errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]) + ) + ); + } else { + zenith_log(LOG, "using auth token from environment passed via env"); + + // inspired by PQconninfoFree and conninfo_storeval + // so just free the old one and replace with freshly malloc'ed one + free(conn_option->val); + conn_option->val = strdup(auth_token); + } + } + } + } + + // copy values from PQconninfoOption to key/value arrays because PQconnectdbParams accepts options this way + const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); + const char **values = malloc((noptions + 1) * sizeof(*values)); + int i = 0; + + for (i = 0; i < noptions; i++) + { + keywords[i] = conn_options[i].keyword; + values[i] = conn_options[i].val; + } + // add array terminator + 
keywords[i] = NULL; + values[i] = NULL; + + pageserver_conn = PQconnectdbParams(keywords, values, false); + free(keywords); + free(values); - pageserver_conn = PQconnectdb(page_server_connstring); + PQconninfoFree(conn_options); if (PQstatus(pageserver_conn) == CONNECTION_BAD) { From 4c15b7cf3b514d8e4934468825628c35c8350b1e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 13 Aug 2021 14:01:07 +0300 Subject: [PATCH 032/167] [contrib/zenith] Fix race condition while WAL-logging page, leading to CRC errors. zenith_wallog_page() would call log_newpage() on a buffer, while holding merely a shared lock on the page. That's not cool, because another backend could modify the page concurrently. We allow changing hint bits while holding only a shared lock, and changes on FSM pages, at least. See comments in XLogSaveBufferForHint() for discussion of this problem. One instance of the race condition that I was able to capture on my laptop happened like this: 1. Backend A: needs to evict an FSM page from the buffer cache to make room for a new page, and calls zenith_wallog_page() on it. That is done while holding a share lock on the page. 2. Backend A: XLogInsertRecord() computes the CRC of the FPI WAL record including the FSM page 3. Backend B: Updates the same FSM page while holding only a share lock 4. Backend A: Allocates space in the WAL buffers, and copies the WAL record header and the page to the buffers. At this point, the CRC that backend A computed earlier doesn't match the contents that were written out to the WAL buffers. The update of the FSM page in backend B happened from there (fsmpage.c): /* * Update the next-target pointer. Note that we do this even if we're only * holding a shared lock, on the grounds that it's better to use a shared * lock and get a garbled next pointer every now and then, than take the * concurrency hit of an exclusive lock. * * Wrap-around is handled at the beginning of this function. 
*/ fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); To fix, make a temporary copy of the page in zenith_wallog_page(), and WAL-log that. Just like XLogSaveBufferForHint() does. Fixes https://github.com/zenithdb/zenith/issues/413 --- contrib/zenith/pagestore_smgr.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 3d24cb79f5f..47a37b0687d 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -238,6 +238,25 @@ zm_to_string(ZenithMessage *msg) return s.data; } +/* + * Wrapper around log_newpage() that makes a temporary copy of the block and + * WAL-logs that. This makes it safe to use while holding only a shared lock + * on the page, see XLogSaveBufferForHint. We don't use XLogSaveBufferForHint + * directly because it skips the logging if the LSN is new enough. + */ +static XLogRecPtr +log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, + Page page, bool page_std) +{ + PGAlignedBlock copied_buffer; + + /* set the flag in the original page, like log_newpage() does. */ + ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; + + memcpy(copied_buffer.data, page, BLCKSZ); + return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); +} + static void zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) @@ -264,7 +283,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; - recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; elog(SmgrTrace, "FSM page %u of relation %u/%u/%u.%u was force logged. 
Evicted at lsn=%X", @@ -284,7 +303,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Hopefully we do not evict actively used vm too often. */ XLogRecPtr recptr; - recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; @@ -307,7 +326,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * TODO Do we have any special page types? */ - recptr = log_newpage(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); /* If we wal-log hint bits, someone could concurrently update page * and reset PD_WAL_LOGGED again, so this assert is not relevant anymore. From 209d0c08e22cf884d939d42dbea60f1e9e1a9f83 Mon Sep 17 00:00:00 2001 From: Max Sharnoff Date: Fri, 13 Aug 2021 11:23:16 -0700 Subject: [PATCH 033/167] [walproposer] Rework walkeeper protocol to use libpq (#60) The majority of work here is going to be heavily cleaned up soon, but it's worth giving a brief overview of the changes either way. * Adds libpqwalproposer, serving a similar function to the existing libpqwalreceiver -- to provide access to libpq functions without causing problems from directly linking them. * Adds two new state components, giving (a) the type of libpq-specific polling required to move on to the next protocol state and (b) the kind of socket events it's waiting on. (These are expected to be removed or heavily reworked soon.) * Changes `WalProposerPoll` to make use of a slightly more specialized `AdvancePollState`, which has been completely reworked. 
--- src/Makefile | 1 + .../replication/libpqwalproposer/Makefile | 37 + .../libpqwalproposer/libpqwalproposer.c | 327 +++++ src/backend/replication/walproposer.c | 1169 ++++++++++++----- src/backend/replication/walproposer_utils.c | 285 ++-- src/include/replication/walproposer.h | 422 +++++- 6 files changed, 1783 insertions(+), 458 deletions(-) create mode 100644 src/backend/replication/libpqwalproposer/Makefile create mode 100644 src/backend/replication/libpqwalproposer/libpqwalproposer.c diff --git a/src/Makefile b/src/Makefile index 79e274a4769..2f32e3d5137 100644 --- a/src/Makefile +++ b/src/Makefile @@ -22,6 +22,7 @@ SUBDIRS = \ include \ interfaces \ backend/replication/libpqwalreceiver \ + backend/replication/libpqwalproposer \ backend/replication/pgoutput \ fe_utils \ bin \ diff --git a/src/backend/replication/libpqwalproposer/Makefile b/src/backend/replication/libpqwalproposer/Makefile new file mode 100644 index 00000000000..c570160536f --- /dev/null +++ b/src/backend/replication/libpqwalproposer/Makefile @@ -0,0 +1,37 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/replication/libpqwalproposer +# +# IDENTIFICATION +# src/backend/replication/libpqwalproposer/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/replication/libpqwalproposer +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) + +OBJS = \ + $(WIN32RES) \ + libpqwalproposer.o +SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = $(filter -lintl, $(LIBS)) +SHLIB_PREREQS = submake-libpq +PGFILEDESC = "libpqwalproposer - libpq interface for WAL proposer" +NAME = libpqwalproposer + +all: all-shared-lib + +include $(top_srcdir)/src/Makefile.shlib + +install: all installdirs install-lib + +installdirs: installdirs-lib + +uninstall: uninstall-lib + +clean distclean maintainer-clean: clean-lib + rm -f $(OBJS) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c new file mode 100644 index 00000000000..a5d7fec1a33 --- /dev/null +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -0,0 +1,327 @@ +#include "replication/walproposer.h" +#include "libpq-fe.h" + +/* Required for anything that's dynamically loaded */ +PG_MODULE_MAGIC; +void _PG_init(void); + +/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ +struct WalProposerConn +{ + PGconn* pg_conn; +}; + +/* Prototypes for exported functions */ +static char* libpqprop_error_message(WalProposerConn* conn); +static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn); +static WalProposerConn* libpqprop_connect_start(char* conninfo); +static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); +static bool libpqprop_send_query(WalProposerConn* conn, char* query); +static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); +static int libpqprop_set_nonblocking(WalProposerConn* conn, int arg); +static pgsocket libpqprop_socket(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn); +static int libpqprop_consume_input(WalProposerConn* conn); +static void libpqprop_finish(WalProposerConn* conn); +static PGAsyncReadResult 
libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); +static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); + +static WalProposerFunctionsType PQWalProposerFunctions = { + libpqprop_error_message, + libpqprop_status, + libpqprop_connect_start, + libpqprop_connect_poll, + libpqprop_send_query, + libpqprop_get_query_result, + libpqprop_set_nonblocking, + libpqprop_socket, + libpqprop_flush, + libpqprop_consume_input, + libpqprop_finish, + libpqprop_async_read, + libpqprop_async_write, +}; + +/* Module initialization */ +void +_PG_init(void) +{ + if (WalProposerFunctions != NULL) + elog(ERROR, "libpqwalproposer already loaded"); + WalProposerFunctions = &PQWalProposerFunctions; +} + +/* Exported function definitions */ +static char* +libpqprop_error_message(WalProposerConn* conn) +{ + return PQerrorMessage(conn->pg_conn); +} + +static WalProposerConnStatusType +libpqprop_status(WalProposerConn* conn) +{ + switch (PQstatus(conn->pg_conn)) + { + case CONNECTION_OK: + return WP_CONNECTION_OK; + case CONNECTION_BAD: + return WP_CONNECTION_BAD; + default: + return WP_CONNECTION_IN_PROGRESS; + } +} + +static WalProposerConn* +libpqprop_connect_start(char* conninfo) +{ + WalProposerConn* conn; + PGconn* pg_conn; + + pg_conn = PQconnectStart(conninfo); + /* + * Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the + * behavior of PQconnectStart here. + */ + if (!pg_conn) + return NULL; + + /* + * And in theory this allocation can fail as well, but it's incredibly unlikely if we just + * successfully allocated a PGconn. + * + * palloc will exit on failure though, so there's not much we could do if it *did* fail. 
+ */ + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + return conn; +} + +static WalProposerConnectPollStatusType +libpqprop_connect_poll(WalProposerConn* conn) +{ + WalProposerConnectPollStatusType return_val; + + switch (PQconnectPoll(conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + return_val = WP_CONN_POLLING_FAILED; + break; + case PGRES_POLLING_READING: + return_val = WP_CONN_POLLING_READING; + break; + case PGRES_POLLING_WRITING: + return_val = WP_CONN_POLLING_WRITING; + break; + case PGRES_POLLING_OK: + return_val = WP_CONN_POLLING_OK; + break; + + /* There's a comment at its source about this constant being unused. We'll expect it's never + * returned. */ + case PGRES_POLLING_ACTIVE: + elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + /* This return is never actually reached, but it's here to make the compiler happy */ + return WP_CONN_POLLING_FAILED; + } + + return return_val; +} + +static bool +libpqprop_send_query(WalProposerConn* conn, char* query) +{ + int result; + bool return_val; + + switch ((result = PQsendQuery(conn->pg_conn, query))) + { + case 0: + return_val = false; + break; + case 1: + return_val = true; + break; + default: + elog(FATAL, "unexpected return %d from PQsendQuery", result); + } + + return return_val; +} + +static WalProposerExecStatusType +libpqprop_get_query_result(WalProposerConn* conn) +{ + PGresult* result; + WalProposerExecStatusType return_val; + + /* Marker variable if we need to log an unexpected success result */ + char* unexpected_success = NULL; + + if (PQisBusy(conn->pg_conn)) + return WP_EXEC_NEEDS_INPUT; + + + result = PQgetResult(conn->pg_conn); + /* PQgetResult returns NULL only if getting the result was successful & there's no more of the + * result to get. 
*/ + if (!result) + { + elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + return WP_EXEC_UNEXPECTED_SUCCESS; + } + + /* Helper macro to reduce boilerplate */ + #define UNEXPECTED_SUCCESS(msg) \ + return_val = WP_EXEC_UNEXPECTED_SUCCESS; \ + unexpected_success = msg; \ + break; + + + switch (PQresultStatus(result)) + { + /* "true" success case */ + case PGRES_COPY_BOTH: + return_val = WP_EXEC_SUCCESS_COPYBOTH; + break; + + /* Unexpected success case */ + case PGRES_EMPTY_QUERY: + UNEXPECTED_SUCCESS("empty query return"); + case PGRES_COMMAND_OK: + UNEXPECTED_SUCCESS("data-less command end"); + case PGRES_TUPLES_OK: + UNEXPECTED_SUCCESS("tuples return"); + case PGRES_COPY_OUT: + UNEXPECTED_SUCCESS("'Copy Out' response"); + case PGRES_COPY_IN: + UNEXPECTED_SUCCESS("'Copy In' response"); + case PGRES_SINGLE_TUPLE: + UNEXPECTED_SUCCESS("single tuple return"); + case PGRES_PIPELINE_SYNC: + UNEXPECTED_SUCCESS("pipeline sync point"); + + /* Failure cases */ + case PGRES_BAD_RESPONSE: + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_PIPELINE_ABORTED: + return_val = WP_EXEC_FAILED; + break; + } + + if (unexpected_success) + elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + + return return_val; +} + +static int +libpqprop_set_nonblocking(WalProposerConn* conn, int arg) +{ + return PQsetnonblocking(conn->pg_conn, arg); +} + +static pgsocket +libpqprop_socket(WalProposerConn* conn) +{ + return PQsocket(conn->pg_conn); +} + +static int +libpqprop_flush(WalProposerConn* conn) +{ + return (PQflush(conn->pg_conn)); +} + +static int +libpqprop_consume_input(WalProposerConn* conn) +{ + return (PQconsumeInput(conn->pg_conn)); +} + +static void +libpqprop_finish(WalProposerConn* conn) +{ + PQfinish(conn->pg_conn); + pfree(conn); +} + +static PGAsyncReadResult +libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) +{ + int result; + + /* The docs for PQgetCopyData list the return values 
as: + * 0 if the copy is still in progress, but no "complete row" is + * available + * -1 if the copy is done + * -2 if an error occured + * (> 0) if it was successful; that value is the amount transferred. + * + * The protocol we use between walproposer and walkeeper means that we + * (i.e. walproposer) won't ever receive a message saying that the copy + * is done. */ + switch (result = PQgetCopyData(conn->pg_conn, buf, true)) + { + case 0: + return PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN; + case -1: + /* As mentioned above; this shouldn't happen */ + elog(FATAL, "unexpected return -1 from PQgetCopyData"); + break; + case -2: + return PG_ASYNC_READ_FAIL; + default: + /* Positive values indicate the size of the returned result */ + *amount = result; + return PG_ASYNC_READ_SUCCESS; + } +} + +static PGAsyncWriteResult +libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* The docs for PQputcopyData list the return values as: + * 1 if the data was queued, + * 0 if it was not queued because of full buffers, or + * -1 if an error occured + */ + switch (result = PQputCopyData(conn->pg_conn, buf, size)) + { + case 1: + /* good -- continue */ + break; + case 0: + /* FIXME: can this ever happen? the structure of walproposer + * should always empty the connection's buffers before trying + * to send more, right? */ + return PG_ASYNC_WRITE_WOULDBLOCK; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQputCopyData", result); + } + + /* After queueing the data, we still need to flush to get it to send. + * This might take multiple tries, but we don't want to wait around + * until it's done. 
+ * + * PQflush has the following returns (directly quoting the docs): + * 0 if sucessful, + * 1 if it was unable to send all the data in the send queue yet + * -1 if it failed for some reason + */ + switch (result = PQflush(conn->pg_conn)) { + case 0: + return PG_ASYNC_WRITE_SUCCESS; + case 1: + return PG_ASYNC_WRITE_TRY_FLUSH; + case -1: + return PG_ASYNC_WRITE_FAIL; + default: + elog(FATAL, "invalid return %d from PQflush", result); + } +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index a03ca9952a0..102ce033949 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -21,10 +21,15 @@ #include "utils/memutils.h" #include "utils/timestamp.h" + char* wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; + +/* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ +WalProposerFunctionsType* WalProposerFunctions = NULL; + #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" static int n_walkeepers = 0; @@ -43,6 +48,12 @@ static int leader; /* Most advanced walkeeper */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; +static uint32 request_poll_immediate; /* bitset of walkeepers requesting AdvancePollState */ + +/* Declarations of a few functions ahead of time, so that we can define them out of order. */ +static void AdvancePollState(int i, uint32 events); +static bool ReadPGAsyncIntoValue(int i, void* value, size_t value_size); +static void HackyRemoveWalProposerEvent(int to_remove); /* * Combine hot standby feedbacks from all walkeepers. 
@@ -72,38 +83,115 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback* hs) } } +/* Initializes the internal event set, provided that it is currently null */ static void -ResetWalProposerEventSet(void) +InitEventSet(void) { if (waitEvents) - FreeWaitEventSet(waitEvents); + elog(FATAL, "double-initialization of event set"); + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); +} + +/* + * Updates the stored wait event for the walkeeper, given its current sockWaitState + * + * remove_if_nothing specifies whether to remove the event if the new waiting set is empty. In + * certain cases, we have remove_if_nothing = false because it's known that the walkeeper state will + * be updated immediately after if it's not waiting for any events. + * + * In general, setting remove_if_nothing = false is just an optimization; setting it to true will + * almost always be correct. Please leave a comment arguing for the validity of this optimization if + * you use it. + */ +static void +UpdateEventSet(int i, bool remove_if_nothing) +{ + uint32 events; + WalKeeper* wk = &walkeeper[i]; + + /* + * If there isn't an applicable way to update the event, we just don't bother. This function is + * sometimes called when the walkeeper isn't waiting for anything, and so the best thing to do + * is just nothing. + */ + if (wk->sockWaitState != WANTS_NO_WAIT) + { + events = WaitKindAsEvents(wk->sockWaitState); + + /* If we don't already have an event, add one! 
*/ + if (wk->eventPos == -1) + wk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(wk->conn), NULL, wk); + else + ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); + } + else if (remove_if_nothing && wk->eventPos != 1) + HackyRemoveWalProposerEvent(i); +} + +/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. + * + * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. + */ +static void +HackyRemoveWalProposerEvent(int to_remove) +{ + /* Remove the existing event set */ + if (waitEvents) { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } + /* Re-initialize it without adding any walkeeper events */ + InitEventSet(); + + /* loop through the existing walkeepers. If they aren't the one we're removing, and if they have + * a socket we can use, re-add the applicable events. + * + * We're expecting that there's no other walkeepers with `.sockWaitState = WANTS_NO_WAIT`, + * because any state without waiting should should have been handled immediately. 
*/ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].sock != PGINVALID_SOCKET) + walkeeper[i].eventPos = -1; + + if (i == to_remove) + continue; + + if (walkeeper[i].conn) { - int events; - switch (walkeeper[i].state) + UpdateEventSet(i, false); + + if (walkeeper[i].sockWaitState == WANTS_NO_WAIT) { - case SS_SEND_WAL: - events = WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE; - break; - case SS_CONNECTING: - events = WL_SOCKET_WRITEABLE; - break; - default: - events = WL_SOCKET_READABLE; - break; + elog(FATAL, "Unexpected walkeeper %s:%s in %s state waiting for nothing", + walkeeper[i].host, walkeeper[i].port, FormatWalKeeperState(walkeeper[i].state)); + } + else + { + UpdateEventSet(i, false); /* Will either add an event or do nothing */ } - walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, events, walkeeper[i].sock, NULL, &walkeeper[i]); } } } +/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(int i, bool remove_event) +{ + if (walkeeper[i].conn) + walprop_finish(walkeeper[i].conn); + walkeeper[i].conn = NULL; + walkeeper[i].state = SS_OFFLINE; + walkeeper[i].pollState = SPOLL_NONE; + walkeeper[i].sockWaitState = WANTS_NO_WAIT; + + if (remove_event) + HackyRemoveWalProposerEvent(i); +} + /* * This function is called to establish new connection or to reestablish connection in case * of connection failure. 
@@ -112,52 +200,74 @@ ResetWalProposerEventSet(void) static void ResetConnection(int i) { - bool established; + pgsocket sock; /* socket of the new connection */ + WalKeeper *wk = &walkeeper[i]; - if (walkeeper[i].state != SS_OFFLINE) + if (wk->state != SS_OFFLINE) { - elog(WARNING, "Connection with node %s:%s failed: %m", - walkeeper[i].host, walkeeper[i].port); - - /* Close old connection */ - closesocket(walkeeper[i].sock); - walkeeper[i].sock = PGINVALID_SOCKET; - walkeeper[i].state = SS_OFFLINE; - - /* Postgres wait event set API doesn't support deletion of events, so we have to reconstruct set */ - ResetWalProposerEventSet(); + elog(WARNING, "Connection with node %s:%s in %s state failed", + wk->host, wk->port, FormatWalKeeperState(wk->state)); + ShutdownConnection(i, true); } - /* Try to establish new connection */ - walkeeper[i].sock = ConnectSocketAsync(walkeeper[i].host, walkeeper[i].port, &established); - if (walkeeper[i].sock != PGINVALID_SOCKET) + /* Try to establish new connection + * + * If the connection information hasn't been filled out, we need to do + * that here. */ + if (wk->conninfo[0] == '\0') { - elog(LOG, "%s with node %s:%s", - established ? 
"Connected" : "Connecting", walkeeper[i].host, walkeeper[i].port); + sprintf((char*) &wk->conninfo, + "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", + wk->host, wk->port, zenith_timeline_walproposer); + } + wk->conn = walprop_connect_start((char*) &wk->conninfo); - if (established) - { - /* Start handshake: first of all send information about server */ - if (WriteSocket(walkeeper[i].sock, &serverInfo, sizeof serverInfo)) - { - walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_READABLE, walkeeper[i].sock, NULL, &walkeeper[i]); - walkeeper[i].state = SS_HANDSHAKE; - walkeeper[i].asyncOffs = 0; - } - else - { - ResetConnection(i); - } - } - else - { - walkeeper[i].eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, walkeeper[i].sock, NULL, &walkeeper[i]); - walkeeper[i].state = SS_CONNECTING; - } + /* "If the result is null, then libpq has been unable to allocate a new PGconn structure" */ + if (!wk->conn) + elog(FATAL, "failed to allocate new PGconn object"); + + /* The connection should always be non-blocking. It's easiest to just set that here. */ + walprop_set_nonblocking(wk->conn, true); + + /* PQconnectStart won't actually start connecting until we run PQconnectPoll. Before we do that + * though, we need to check that it didn't immediately fail. */ + if (walprop_status(wk->conn) == WP_CONNECTION_BAD) + { + /* According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, typically + * because of invalid connection parameters." + * We should report this failure. 
+ * + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ + elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", + wk->conninfo, walprop_error_message(wk->conn)); + /* Even though the connection failed, we still need to clean up the object */ + walprop_finish(wk->conn); + wk->conn = NULL; + return; } -} + /* The documentation for PQconnectStart states that we should call PQconnectPoll in a loop until + * it returns PGRES_POLLING_OK or PGRES_POLLING_FAILED. The other two possible returns indicate + * whether we should wait for reading or writing on the socket. For the first iteration of the + * loop, we're expected to wait until the socket becomes writable. + * + * The wording of the documentation is a little ambiguous; thankfully there's an example in the + * postgres source itself showing this behavior. + * (see libpqrcv_connect, defined in + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) + */ + elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); + + wk->state = SS_CONNECTING; + wk->pollState = SPOLL_CONNECT; + wk->sockWaitState = WANTS_SOCK_WRITE; + + sock = walprop_socket(wk->conn); + wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, wk); + return; +} /* * Calculate WAL position acknowledged by quorum @@ -241,7 +351,11 @@ WalProposerMain(Datum main_arg) pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); - /* Load the libpq-specific functions */ + /* Load the libpq-specific functions */ + load_file("libpqwalproposer", false); + if (WalProposerFunctions == NULL) + elog(ERROR, "libpqwalproposer didn't initialize correctly"); + load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); @@ -267,7 +381,9 @@ WalProposerMain(Datum main_arg) walkeeper[n_walkeepers].host = host; walkeeper[n_walkeepers].port = port; walkeeper[n_walkeepers].state = SS_OFFLINE; - 
walkeeper[n_walkeepers].sock = PGINVALID_SOCKET; + walkeeper[n_walkeepers].conn = NULL; + /* Set conninfo to empty. We'll fill it out once later, in `ResetConnection` as needed */ + walkeeper[n_walkeepers].conninfo[0] = '\0'; walkeeper[n_walkeepers].currMsg = NULL; n_walkeepers += 1; } @@ -289,7 +405,7 @@ WalProposerMain(Datum main_arg) if (*zenith_timeline_walproposer != '\0' && !HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); - + if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && @@ -306,7 +422,7 @@ WalProposerMain(Datum main_arg) am_wal_proposer = true; am_walsender = true; InitWalSender(); - ResetWalProposerEventSet(); + InitEventSet(); /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) @@ -343,48 +459,38 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void SendMessageToNode(int i, WalMessage* msg) { - ssize_t rc; + WalKeeper* wk = &walkeeper[i]; /* If there is no pending message then send new one */ - if (walkeeper[i].currMsg == NULL) + if (wk->currMsg == NULL) { /* Skip already acknowledged messages */ while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; - walkeeper[i].currMsg = msg; + wk->currMsg = msg; } - else - msg = walkeeper[i].currMsg; - if (msg != NULL) + /* Only try to send the message if it's non-null */ + if (wk->currMsg) { - msg->req.restartLsn = restartLsn; - msg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); + wk->currMsg->req.restartLsn = restartLsn; + wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - elog(LOG, "sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %d", - msg->size - sizeof(WalKeeperRequest), - LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(restartLsn), - i); + /* Once we've selected and set up 
our message, actually start sending it. */ + wk->state = SS_SEND_WAL; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + /* Don't ned to update the event set; that's done by AdvancePollState */ - rc = WriteSocketAsync(walkeeper[i].sock, &msg->req, msg->size); - if (rc < 0) - { - ResetConnection(i); - } - else if ((size_t)rc == msg->size) /* message was completely sent */ - { - walkeeper[i].asyncOffs = 0; - walkeeper[i].state = SS_RECV_FEEDBACK; - } - else - { - /* wait until socket is available for write */ - walkeeper[i].state = SS_SEND_WAL; - walkeeper[i].asyncOffs = rc; - ModifyWaitEvent(waitEvents, walkeeper[i].eventPos, WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE, NULL); - } + AdvancePollState(i, WL_NO_EVENTS); + } + else + { + wk->state = SS_IDLE; + wk->pollState = SPOLL_IDLE; + wk->sockWaitState = WANTS_SOCK_READ; + UpdateEventSet(i, true); } } @@ -549,10 +655,10 @@ TimeToReconnect(TimestampTz now) return (long) (till_reconnect / 1000); } +/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ static void ReconnectWalKeepers(void) { - /* Initiate reconnect if timeout is expired */ TimestampTz now = GetCurrentTimestamp(); if (TimeToReconnect(now) == 0) { @@ -645,257 +751,726 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe return true; } -/* Advance the WAL proposer state machine. */ +/* Requests the currently-running WalProposerPoll to advance the state of this walkeeper */ +static void +RequestStateAdvanceNoPoll(int i) +{ + /* We only have to change the value here; it'll be detected in a call to + * AdvancePollForAllRequested when that's made. */ + request_poll_immediate |= (1 << i); +} + +static void +AdvancePollForAllRequested(void) +{ + uint32 poll_set = request_poll_immediate; + + /* + * We have this in a loop because -- in theory -- polling the requested states could produce + * more that are ready to be polled, though this *really* shouldn't occur in practice. 
+ */ + while ((poll_set = request_poll_immediate)) + { + /* "Take responsibility" for the poll set. We don't want any possibility of other calls to + * AdvancePollForAllRequested duplicating an AdvancePollState. */ + request_poll_immediate = 0; + + /* + * Loop through all nonzero bits and call AdvancePollState + * + * FIXME: This can probably be much more efficient, using something like __builtin__clz. + * Maybe it doesn't matter though. + */ + for (int i = 0; i < n_walkeepers; i++) + { + /* If the ith bit is set, that state requested advancement */ + if (poll_set & (1 << i)) + AdvancePollState(i, WL_NO_EVENTS); + } + } +} + +/* + * Advance the WAL proposer state machine, waiting each time for events to occur + */ void WalProposerPoll(void) { while (true) { + WalKeeper* wk; + int rc; + int i; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); - int rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - WalKeeper* wk = (WalKeeper*) event.user_data; - int i = (int)(wk - walkeeper); + + rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + wk = (WalKeeper*) event.user_data; + i = (int)(wk - walkeeper); if (rc != 0) { - /* communication with walkeepers */ - if (event.events & WL_SOCKET_READABLE) + /* + * If the event contains something that one of our walkeeper states + * was waiting for, we'll advance its state. + */ + if (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + AdvancePollState(i, event.events); + + /* + * It's possible for AdvancePollState to result in extra states + * being ready to immediately advance to the next state (with + * pollState = SPOLL_NONE). We deal with that here. + */ + AdvancePollForAllRequested(); + } + + /* If the timeout expired, attempt to reconnect to any walkeepers that we dropped */ + ReconnectWalKeepers(); + + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. 
(no need for pm death check due to + * WL_EXIT_ON_PM_DEATH) + */ + if (rc != 0 && (event.events & WL_LATCH_SET)) + { + ResetLatch(MyLatch); + break; + } + } +} + +/* Performs the logic for advancing the state machine of the 'i'th walkeeper, given that a certain + * set of events has occured. */ +static void +AdvancePollState(int i, uint32 events) +{ + WalKeeper* wk = &walkeeper[i]; + + /* Continue polling all the while we don't need to wait. + * + * At the bottom of this function is "while (walkeeper[i].sockWaitState == WANTS_NO_WAIT)" */ + do { + uint32 expected_events = WaitKindAsEvents(wk->sockWaitState); + + /* If we were expecting SOME event but nothing happened, panic. */ + if ((expected_events & events) == 0 && expected_events) + { + elog(FATAL, + "unexpected event for WalKeeper poll. Expected %s, found code %s (see: FormatEvents).", + FormatWKSockWaitKind(wk->sockWaitState), FormatEvents(events)); + } + + /* Now that we've checked the event is ok, we'll actually run the thing we're looking for */ + switch (wk->pollState) + { + /* If the polling corresponds to a "full" operation, we'll skip straight to that - we + * don't actually need to poll here. */ + case SPOLL_NONE: + case SPOLL_RETRY: + /* Equivalent to 'break', but more descriptive. */ + goto ExecuteNextProtocolState; + + /* On idle polling states, we wait for the socket to open for reading. If this happens, + * the connection has closed *normally*, so we're just done. 
*/ + case SPOLL_IDLE: + elog(LOG, "Walkeeper %s:%s closed connection from %s state", + wk->host, wk->port, FormatWalKeeperState(wk->state)); + /* 'true' to remove existing event for this walkeeper */ + ShutdownConnection(i, true); + return; + + /* Call PQconnectPoll to finalize the connection */ + case SPOLL_CONNECT: { - switch (wk->state) + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); + pgsocket new_sock = walprop_socket(wk->conn); + + switch (result) { - case SS_HANDSHAKE: - /* Receive walkeeper node state */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info + wk->asyncOffs, - sizeof(wk->info) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info)) - { - /* WalKeeper response completely received */ - - /* Check protocol version */ - if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) - { - elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", - wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); - ResetConnection(i); - } - else - { - wk->state = SS_VOTING; - wk->feedback.flushLsn = restartLsn; - wk->feedback.hs.ts = 0; - - /* Check if we have quorum */ - if (++n_connected >= quorum) - { - if (n_connected == quorum) - StartElection(); - - /* Now send max-node-id to everyone participating in voting and wait their responses */ - for (int j = 0; j < n_walkeepers; j++) - { - if (walkeeper[j].state == SS_VOTING) - { - if (!WriteSocket(walkeeper[j].sock, &prop, sizeof(prop))) - { - ResetConnection(j); - } - else - { - walkeeper[j].asyncOffs = 0; - walkeeper[j].state = SS_WAIT_VERDICT; - } - } - } - } - } - } + case WP_CONN_POLLING_OK: + elog(LOG, "Connected with node %s:%s", wk->host, wk->port); + + /* If we're fully connected, we're good! 
We can move on to the next state */ + wk->state = SS_EXEC_STARTWALPUSH; + + /* Update the socket -- it might have changed */ + HackyRemoveWalProposerEvent(i); + + /* We need to just pick an event to wait on; this will be overriden + * anyways later. */ + wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, new_sock, NULL, wk); + + /* We're done, but some of the other result cases have cleanup left to do */ + goto ExecuteNextProtocolState; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + /* If connecting failed, we don't want to restart the connection because + * that might run us into a loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to ReconnectWalKeepers. */ + ShutdownConnection(i, true); + return; + + case WP_CONN_POLLING_READING: + wk->sockWaitState = WANTS_SOCK_READ; break; - case SS_WAIT_VERDICT: - /* Receive walkeeper response for our candidate */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->info.server.nodeId + wk->asyncOffs, - sizeof(wk->info.server.nodeId) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->info.server.nodeId)) - { - /* Response completely received */ - - /* If server accept our candidate, then it returns it in response */ - if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) - { - elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->info.server.nodeId.term, prop.nodeId.term); - } - else - { - /* Handshake completed, do we have quorum? 
*/ - wk->state = SS_IDLE; - if (++n_votes == quorum) - { - elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", - quorum, - (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) - ); - - /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn < prop.VCL) - { - elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", - LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); - /* Perform recovery */ - if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) - elog(FATAL, "Failed to recover state"); - } - WalProposerStartStreaming(prop.VCL); - /* Should not return here */ - } - else - { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ - SendMessageToNode(i, msgQueueHead); - } - } - } + case WP_CONN_POLLING_WRITING: + wk->sockWaitState = WANTS_SOCK_WRITE; break; + } - case SS_RECV_FEEDBACK: - /* Read walkeeper response with flushed WAL position */ - rc = ReadSocketAsync(wk->sock, - (char*)&wk->feedback + wk->asyncOffs, - sizeof(wk->feedback) - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == sizeof(wk->feedback)) + /* If we got here, we either have to wait for reading or + * writing. The value of walkeeper[i].sockWaitState indicates + * which one of these it is. + * + * We also have to update the socket here, even if the file + * descriptor itself hasn't changed. It's possible for libpq to + * close the socket and then open a new one, reusing the same + * file descriptor. If this happens, epoll will have + * automatically removed the socket, so we'll stop receiving + * events for it unless we re-add the socket. + * + * To update the socket, we the event and add a new one back. 
+ */ + HackyRemoveWalProposerEvent(i); + + wk->eventPos = AddWaitEventToSet(waitEvents, WaitKindAsEvents(wk->sockWaitState), new_sock, NULL, wk); + + /* We still have polling to do, so we can't move on to the next state. */ + return; + } + + case SPOLL_WRITE_PQ_FLUSH: + { + int flush_result; + + /* If the socket is ready for reading, we have to call PQconsumeInput before + * attempting to flush. */ + if (events & WL_SOCKET_READABLE) + { + /* PQconsumeInput returns 1 if ok, 0 if there was an error */ + if (!walprop_consume_input(wk->conn)) + { + elog(WARNING, "Failed to pre-flush read input for node %s:%s in state [%s]: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + } + + /* PQflush returns: + * 0 if uccessful, + * 1 if unable to send everything yet, + * -1 if it failed */ + switch (flush_result = walprop_flush(wk->conn)) + { + case 0: + /* On success, go to the next state. Our current state only indicates the + * state that *started* the writing, so we need to use that to figure out + * what to do next. */ + switch (wk->state) { - WalMessage* next = wk->currMsg->next; - Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ - wk->state = SS_IDLE; - wk->asyncOffs = 0; - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); - - /* - * Also send the new VCL to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other outbound messages - * pending. 
-				 */
-				if (true)
-				{
-					XLogRecPtr minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
-					WalMessage *vclUpdateMsg;
-
-					if (minQuorumLsn > lastSentVCLLsn)
-					{
-						vclUpdateMsg = CreateMessageVCLOnly();
-						if (vclUpdateMsg)
-							BroadcastMessage(vclUpdateMsg);
-						lastSentVCLLsn = minQuorumLsn;
-					}
-				}
+						case SS_EXEC_STARTWALPUSH:
+							wk->state = SS_WAIT_EXEC_RESULT;
+							break;
+						case SS_HANDSHAKE_SEND:
+							wk->state = SS_HANDSHAKE_RECV;
+							break;
+						case SS_SEND_VOTE:
+							wk->state = SS_WAIT_VERDICT;
+							break;
+						case SS_SEND_WAL:
+							wk->state = SS_RECV_FEEDBACK;
+							break;
+						default:
+							elog(FATAL, "Unexpected writing state [%s] for node %s:%s",
+								 FormatWalKeeperState(wk->state), wk->host, wk->port);
 					}
+
+					wk->pollState = SPOLL_NONE;
+					wk->sockWaitState = WANTS_NO_WAIT;
 					break;
-				case SS_IDLE:
-					elog(WARNING, "WalKeeper %s:%s drops connection", wk->host, wk->port);
+				case 1:
+					/* Nothing more to do - we'll just have to wait until we can flush again */
+					return;
+				case -1:
+					elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
+						 wk->host, wk->port, FormatWalKeeperState(wk->state),
+						 walprop_error_message(wk->conn));
 					ResetConnection(i);
 					break;
-				default:
-					elog(FATAL, "Unexpected walkeeper %s:%s read state %d", wk->host, wk->port, wk->state);
+				/* FIX: this elog was added without a 'default:' label, making it
+				 * unreachable dead code after the preceding 'break'. Label it so an
+				 * out-of-range PQflush return is actually reported, matching the
+				 * identical switch in SS_EXEC_STARTWALPUSH. */
+				default:
+					elog(FATAL, "invalid return %d from PQflush", flush_result);
 				}
+				break;
 			}
-		}
-		else if (event.events & WL_SOCKET_WRITEABLE)
+
+			case SPOLL_PQ_CONSUME_AND_RETRY:
+				/* PQconsumeInput returns 1 on success (though maybe nothing was read), and 0 on
+				 * failure.
*/ + if (walprop_consume_input(wk->conn)) + /* On success, retry the operation */ + goto ExecuteNextProtocolState; + else + { + /* On failure, print the failure and move on */ + elog(WARNING, "Failed to read input for node %s:%s in state %s: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + } + +ExecuteNextProtocolState: + /* If we get here, walkeeper[i].pollState now corresponds to either SPOLL_NONE or + * SPOLL_RETRY. In either case, we should execute the operation described by the high-level + * state. + * + * All of the cases in this switch statement are provided in the order that state + * transitions happen, moving downwards. So `SS_CONNECTING` moves into + * `SS_EXEC_STARTWALPUSH`, `SS_EXEC_STARTWALPUSH` moves into `SS_WAIT_EXEC_RESULT`, etc. + * + * If/when new states are added, they should abide by the same formatting. + * + * More information about the high-level flow between states is available in the comments + * for WalKeeperState. */ + switch (wk->state) + { + /* walkeepers aren't taken out of SS_OFFLINE by polling. */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* Connecting is handled by the SPOLL_CONNECT, which then puts us into + * SS_EXEC_STARTWALPUSH. There's no singular state advancement to be made here. */ + case SS_CONNECTING: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is connecting", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* Send "START_WAL_PUSH" command to the walkeeper. 
After sending, wait for response with + * SS_WAIT_EXEC_RESULT */ + case SS_EXEC_STARTWALPUSH: { - switch (wk->state) + int flush_result; + + if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) + { + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + + /* The query has been started (put into buffers), but hasn't been flushed yet. We + * should do that now. If there's more flushing required, keep doing that until it's + * done */ + switch ((flush_result = walprop_flush(wk->conn))) + { + case 0: + /* success -- go to the next state */ + wk->state = SS_WAIT_EXEC_RESULT; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + case 1: + /* we'll have to flush again */ + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s to exec command: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ResetConnection(i); + return; + default: + elog(FATAL, "invalid return %d from PQflush", flush_result); + } + + /* If no waiting is required, we'll get to that shortly */ + UpdateEventSet(i, false); + break; + } + + /* Waiting for the result of the "START_WAL_PUSH" command. If successful, proceed to + * SS_HANDSHAKE_SEND. If needs more, wait until we can read and retry. 
*/
+			case SS_WAIT_EXEC_RESULT:
+				/* Call our wrapper around PQisBusy + PQgetResult to inspect the result */
+				switch (walprop_get_query_result(wk->conn))
+				{
+					/* Successful result, move on to starting the handshake */
+					case WP_EXEC_SUCCESS_COPYBOTH:
+						wk->state = SS_HANDSHAKE_SEND;
+						wk->pollState = SPOLL_NONE;
+						wk->sockWaitState = WANTS_NO_WAIT;
+						break;
+
+					/* We need more calls to PQconsumeInput to completely receive this result */
+					case WP_EXEC_NEEDS_INPUT:
+						wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY;
+						wk->sockWaitState = WANTS_SOCK_READ;
+						break;
+
+					case WP_EXEC_FAILED:
+						elog(WARNING, "Failed to send query to walkeeper %s:%s: %s",
+							 wk->host, wk->port, walprop_error_message(wk->conn));
+						ResetConnection(i);
+						return;
+
+					/* Unexpected result -- fundamentally an error, but we want to produce a custom
+					 * message, rather than a generic "something went wrong" */
+					case WP_EXEC_UNEXPECTED_SUCCESS:
+						elog(WARNING, "Received bad response from walkeeper %s:%s query execution",
+							 wk->host, wk->port);
+						ResetConnection(i);
+						/* FIX: was 'break', which fell through to UpdateEventSet() on a
+						 * connection that ResetConnection() just tore down; return instead,
+						 * consistent with the WP_EXEC_FAILED path above. */
+						return;
+				}
+
+				/* If the wait state is empty, don't remove the event -- we have more work to do */
+				UpdateEventSet(i, false);
+
+				break;
+
+			/* Start handshake: first of all send information about server */
+			case SS_HANDSHAKE_SEND:
+				/* Note: This state corresponds to the process of sending the relevant information
+				 * along. The moment we finish sending, we use SS_HANDSHAKE_RECV to complete the
+				 * handshake. */
+				switch (walprop_async_write(wk->conn, &serverInfo, sizeof(serverInfo)))
 				{
-					case SS_CONNECTING:
+					case PG_ASYNC_WRITE_SUCCESS:
+						/* If the write immediately succeeds, we can move on to the next state.
*/ + wk->state = SS_HANDSHAKE_RECV; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + + case PG_ASYNC_WRITE_WOULDBLOCK: + /* Wait until the socket is write-ready and try again */ + wk->pollState = SPOLL_RETRY; + wk->sockWaitState = WANTS_SOCK_WRITE; + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* We need to call PQflush some number of additional times, with different + * actions depending on whether the socket is readable or writable */ + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + + case PG_ASYNC_WRITE_FAIL: + /* On failure, print the error and reset the connection */ + elog(WARNING, "Handshake with node %s:%s failed to start: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + + /* Update the event set for this walkeeper, depending on what it's been changed to + * + * We set remove_if_nothing = false because we'll immediately execute + * SS_HANDSHAKE_RECV on the next iteration of the outer loop. */ + UpdateEventSet(i, false); + break; + + /* Finish handshake comms: receive information about the walkeeper */ + case SS_HANDSHAKE_RECV: + /* If our reading doesn't immediately succeed, any necessary error handling or state + * setting is taken care of. We can leave any other work until later. */ + if (!ReadPGAsyncIntoValue(i, &wk->info, sizeof(wk->info))) + return; + + /* Check protocol version */ + if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) + { + elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", + wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); + ResetConnection(i); + return; + } + + /* Protocol is all good, move to voting */ + wk->state = SS_VOTING; + wk->pollState = SPOLL_IDLE; + wk->feedback.flushLsn = restartLsn; + wk->feedback.hs.ts = 0; + + /* Check if we have quorum. If there aren't enough walkeepers, wait and do nothing. + * We'll eventually get a task when the election starts. 
+ * + * If we do have quorum, we can start an election */ + if (++n_connected >= quorum) + { + if (n_connected == quorum) + StartElection(); + + /* Now send max-node-id to everyone participating in voting and wait their responses */ + for (int j = 0; j < n_walkeepers; j++) { - int optval = 0; - ACCEPT_TYPE_ARG3 optlen = sizeof(optval); - if (getsockopt(wk->sock, SOL_SOCKET, SO_ERROR, (char *) &optval, &optlen) < 0 || optval != 0) + /* Remember: SS_VOTING indicates that the walkeeper is participating in + * voting, but hasn't sent anything yet. The ones that have sent something + * are given SS_SEND_VOTE or SS_WAIT_VERDICT. */ + if (walkeeper[j].state == SS_VOTING) { - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, - strerror(optval)); - closesocket(wk->sock); - wk->sock = PGINVALID_SOCKET; - wk->state = SS_OFFLINE; - ResetWalProposerEventSet(); - } - else - { - uint32 len = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - /* - * Start handshake: send information about server. - * First of all send 0 as package size: it allows walkeeper to distinguish - * wal_proposer's connection from standard replication connection from pagers. - */ - if (WriteSocket(wk->sock, &len, sizeof len) - && WriteSocket(wk->sock, &serverInfo, sizeof serverInfo)) - { - wk->state = SS_HANDSHAKE; - wk->asyncOffs = 0; - } - else - { - ResetConnection(i); - } + walkeeper[j].state = SS_SEND_VOTE; + walkeeper[j].pollState = SPOLL_NONE; + walkeeper[j].sockWaitState = WANTS_NO_WAIT; + + /* If this isn't the current walkeeper, defer handling this state until + * later. We'll mark it for individual work in WalProposerPoll. */ + if (j != i) + RequestStateAdvanceNoPoll(j); } + } + } + break; + + /* Voting is an idle state - we don't expect any events to trigger. Refer to the + * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to + * SS_SEND_VOTE. 
*/ + case SS_VOTING: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* We have quorum for voting, send our vote request */ + case SS_SEND_VOTE: + switch (walprop_async_write(wk->conn, &prop, sizeof(prop))) + { + case PG_ASYNC_WRITE_SUCCESS: + /* If the write immediately succeeds, we can move on to the next state. */ + wk->state = SS_WAIT_VERDICT; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + case PG_ASYNC_WRITE_WOULDBLOCK: + /* Wait until the socket is write-ready and try again */ + wk->pollState = SPOLL_RETRY; + wk->sockWaitState = WANTS_SOCK_WRITE; break; + case PG_ASYNC_WRITE_TRY_FLUSH: + /* We need to call PQflush some number of additional times, with different + * actions depending on whether the socket is readable or writable */ + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + case PG_ASYNC_WRITE_FAIL: + /* Report the failure and reset the connection; there isn't much + * more we can do. */ + elog(WARNING, "Failed to send vote request to node %s:%s: %s", + wk->host, wk->port, + walprop_error_message(wk->conn)); + ResetConnection(i); + return; + } + + /* Don't remove from the event set if there's nothing we're waiting for; we'll get + * it on the next iteration of the loop */ + UpdateEventSet(i, false); + break; + + /* Start reading the walkeeper response for our candidate */ + case SS_WAIT_VERDICT: + /* If our reading doesn't immediately succeed, any necessary error handling or state + * setting is taken care of. We can leave any other work until later. 
*/ + if (!ReadPGAsyncIntoValue(i, &wk->info.server.nodeId, sizeof(wk->info.server.nodeId))) + return; + + /* If server accept our candidate, then it returns it in response */ + if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + { + elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->info.server.nodeId.term, prop.nodeId.term); + } + + /* Handshake completed, do we have quorum? */ + wk->state = SS_IDLE; + wk->pollState = SPOLL_IDLE; + wk->sockWaitState = WANTS_NO_WAIT; + + if (++n_votes == quorum) + { + elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", + quorum, + (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) + ); + + /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + if (restartLsn < prop.VCL) + { + elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", + LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); + /* Perform recovery */ + if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + elog(FATAL, "Failed to recover state"); } + WalProposerStartStreaming(prop.VCL); + /* Should not return here */ + } + else + { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } - case SS_SEND_WAL: - rc = WriteSocketAsync(wk->sock, (char*)&wk->currMsg->req + wk->asyncOffs, wk->currMsg->size - wk->asyncOffs); - if (rc < 0) - { - ResetConnection(i); - } - else if ((wk->asyncOffs += rc) == wk->currMsg->size) - { - /* WAL block completely sent */ - wk->state = SS_RECV_FEEDBACK; - wk->asyncOffs = 0; - ModifyWaitEvent(waitEvents, wk->eventPos, WL_SOCKET_READABLE, NULL); - } + break; + + /* Start to send the message at wk->currMsg. 
Triggered only by calls to + * SendMessageToNode */ + case SS_SEND_WAL: + { + WalMessage* msg = wk->currMsg; + + /* Don't repeat logs if we have to retry the actual send operation itself */ + if (wk->pollState != SPOLL_RETRY) + { + elog(LOG, "Sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %s:%s", + msg->size - sizeof(WalKeeperRequest), + LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(restartLsn), + wk->host, wk->port); + } + + switch (walprop_async_write(wk->conn, &msg->req, msg->size)) + { + case PG_ASYNC_WRITE_SUCCESS: + wk->state = SS_RECV_FEEDBACK; + wk->pollState = SPOLL_NONE; + wk->sockWaitState = WANTS_NO_WAIT; + break; + case PG_ASYNC_WRITE_WOULDBLOCK: + wk->pollState = SPOLL_RETRY; + wk->sockWaitState = WANTS_SOCK_WRITE; break; + case PG_ASYNC_WRITE_TRY_FLUSH: + wk->pollState = SPOLL_WRITE_PQ_FLUSH; + wk->sockWaitState = WANTS_SOCK_EITHER; + break; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send WAL to node %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + } - default: - elog(FATAL, "Unexpected write state %d", wk->state); + /* Don't remove if if sockWaitState == WANTS_NO_WAIT, because we'll immediately move + * on to SS_RECV_FEEDBACK if that's the case. */ + UpdateEventSet(i, false); + break; + } + + /* Start to receive the feedback from a message sent via SS_SEND_WAL */ + case SS_RECV_FEEDBACK: + { + WalMessage* next; + XLogRecPtr minQuorumLsn; + WalMessage* vclUpdateMsg; + + /* If our reading doesn't immediately succeed, any necessary error handling or state + * setting is taken care of. We can leave any other work until later. 
*/ + if (!ReadPGAsyncIntoValue(i, &wk->feedback, sizeof(wk->feedback))) + return; + + next = wk->currMsg->next; + Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + + wk->state = SS_IDLE; + wk->pollState = SPOLL_IDLE; + wk->sockWaitState = WANTS_NO_WAIT; + /* Don't update the event set; that's handled by SendMessageToNode if necessary */ + + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); + + /* + * Also send the new VCL to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other outbound messages + * pending. + */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + + if (minQuorumLsn > lastSentVCLLsn) + { + vclUpdateMsg = CreateMessageVCLOnly(); + if (vclUpdateMsg) + BroadcastMessage(vclUpdateMsg); + lastSentVCLLsn = minQuorumLsn; } + break; } + + /* Truly an idle state - there isn't any typ of advancement expected here. */ + case SS_IDLE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); + break; /* actually unreachable; makes the compiler happier */ } - ReconnectWalKeepers(); - /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) - */ - if (rc != 0 && (event.events & WL_LATCH_SET)) - { - ResetLatch(MyLatch); + /* On subsequent iterations of the loop, there's no additonal events to process */ + events = WL_NO_EVENTS; + } while (walkeeper[i].sockWaitState == WANTS_NO_WAIT && walkeeper[i].pollState != SPOLL_IDLE); +} + +/* + * Reads a CopyData block into a value, returning whether the read was successful + * + * If the read was not immediately successful (either polling is required, or it actually failed), + * then the state is set appropriately on the walkeeper. 
+ */
+bool
+ReadPGAsyncIntoValue(int i, void* value, size_t value_size)
+{
+	WalKeeper* wk = &walkeeper[i];
+	char *buf = NULL;
+	int buf_size = -1;
+
+	switch (walprop_async_read(wk->conn, &buf, &buf_size))
+	{
+		/* On success, there's just a couple more things we'll check below */
+		case PG_ASYNC_READ_SUCCESS:
 			break;
-		}
+
+		case PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN:
+			wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY;
+
+			if (wk->sockWaitState != WANTS_SOCK_READ)
+			{
+				wk->sockWaitState = WANTS_SOCK_READ;
+				UpdateEventSet(i, true);
+			}
+			return false;
+
+		case PG_ASYNC_READ_FAIL:
+			elog(WARNING, "Failed to read from node %s:%s in %s state: %s",
+				 wk->host, wk->port,
+				 FormatWalKeeperState(wk->state),
+				 walprop_error_message(wk->conn));
+			ResetConnection(i);
+			return false;
 	}
-}
 
+	/*
+	 * If we get here, the read was ok, but we still need to check it was the right amount
+	 */
+	if (buf_size != value_size)
+	{
+		/* FIX: report the size this caller actually asked for (value_size), not the
+		 * copy-pasted sizeof(wk->info.server.nodeId) from the verdict-reading code,
+		 * which made the message wrong for every other call site. */
+		elog(FATAL,
+			 "Unexpected walkeeper %s:%s read length from %s state. Expected %zu, found %d",
+			 wk->host, wk->port,
+			 FormatWalKeeperState(wk->state),
+			 value_size, buf_size);
+	}
+
+	/* Copy the resulting info into place */
+	memcpy(value, buf, buf_size);
+	return true;
+}
 
 /*
  * WalProposerRegister
diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c
index cea41ef01cc..722fa66d5e6 100644
--- a/src/backend/replication/walproposer_utils.c
+++ b/src/backend/replication/walproposer_utils.c
@@ -1,6 +1,7 @@
 #include "replication/walproposer.h"
 #include "common/logging.h"
 #include "common/ip.h"
+#include "../interfaces/libpq/libpq-fe.h"
 
 #include <netinet/tcp.h>
 #include <unistd.h>
@@ -28,170 +29,166 @@ CompareLsn(const void *a, const void *b)
 	return 1;
 }
 
-static bool
-SetSocketOptions(pgsocket sock)
+/* Converts a `WKSockWaitKind` into the bit flags that would match it
+ *
+ * Note: For `wait_kind = WANTS_NO_WAIT`, this will return a value of zero,
+ * which does not match any events.
Attempting to wait on no events will + * always timeout, so it's best to double-check the value being provided to + * this function where necessary. */ +uint32 +WaitKindAsEvents(WKSockWaitKind wait_kind) { - int on = 1; - if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - (char *) &on, sizeof(on)) < 0) - { - elog(WARNING, "setsockopt(TCP_NODELAY) failed: %m"); - closesocket(sock); - return false; - } - if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *) &on, sizeof(on)) < 0) - { - elog(WARNING, "setsockopt(SO_REUSEADDR) failed: %m"); - closesocket(sock); - return false; - } - if (!pg_set_noblock(sock)) - { - elog(WARNING, "faied to switch socket to non-blocking mode: %m"); - closesocket(sock); - return false; - } - return true; -} + uint32 return_val; -pgsocket -ConnectSocketAsync(char const* host, char const* port, bool* established) -{ - struct addrinfo *addrs = NULL, - *addr, - hints; - int ret; - pgsocket sock = PGINVALID_SOCKET; - - hints.ai_flags = AI_PASSIVE; - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - hints.ai_protocol = 0; - hints.ai_addrlen = 0; - hints.ai_addr = NULL; - hints.ai_canonname = NULL; - hints.ai_next = NULL; - ret = pg_getaddrinfo_all(host, port, &hints, &addrs); - if (ret || !addrs) - { - elog(WARNING, "Could not resolve \"%s\": %s", - host, gai_strerror(ret)); - return -1; - } - for (addr = addrs; addr; addr = addr->ai_next) + switch (wait_kind) { - sock = socket(addr->ai_family, SOCK_STREAM, 0); - if (sock == PGINVALID_SOCKET) - { - elog(WARNING, "could not create socket: %m"); - continue; - } - if (!SetSocketOptions(sock)) - continue; - - /* - * Bind it to a kernel assigned port on localhost and get the assigned - * port via getsockname(). 
- */ - while ((ret = connect(sock, addr->ai_addr, addr->ai_addrlen)) < 0 && errno == EINTR); - if (ret < 0) - { - if (errno == EINPROGRESS) - { - *established = false; - break; - } - elog(WARNING, "Could not establish connection to %s:%s: %m", - host, port); - closesocket(sock); - } - else - { - *established = true; - break; - } + case WANTS_NO_WAIT: + return_val = WL_NO_EVENTS; + break; + case WANTS_SOCK_READ: + return_val = WL_SOCKET_READABLE; + break; + case WANTS_SOCK_WRITE: + return_val = WL_SOCKET_WRITEABLE; + break; + case WANTS_SOCK_EITHER: + return_val = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; } - return sock; + + return return_val; } -ssize_t -ReadSocketAsync(pgsocket sock, void* buf, size_t size) + +/* Returns a human-readable string corresonding to the WalKeeperState + * + * The string should not be freed. + * + * The strings are intended to be used as a prefix to "state", e.g.: + * + * elog(LOG, "currently in %s state", FormatWalKeeperState(wk->state)); + * + * If this sort of phrasing doesn't fit the message, instead use something like: + * + * elog(LOG, "currently in state [%s]", FormatWalKeeperState(wk->state)); + */ +char* +FormatWalKeeperState(WalKeeperState state) { - size_t offs = 0; + char* return_val; - while (size != offs) + switch (state) { - ssize_t rc = recv(sock, (char*)buf + offs, size - offs, 0); - if (rc < 0) - { - if (errno == EINTR) - continue; - if (errno == EAGAIN || errno == EWOULDBLOCK) - return offs; - elog(WARNING, "Socket write failed: %m"); - return -1; - } - else if (rc == 0) - { - elog(WARNING, "Connection was closed by peer"); - return -1; - } - offs += rc; + case SS_OFFLINE: + return_val = "offline"; + break; + case SS_CONNECTING: + return_val = "connecting"; + break; + case SS_EXEC_STARTWALPUSH: + return_val = "sending 'START_WAL_PUSH' query"; + break; + case SS_WAIT_EXEC_RESULT: + return_val = "receiving query result"; + break; + case SS_HANDSHAKE_SEND: + return_val = "handshake (sending)"; + break; + case 
SS_HANDSHAKE_RECV:
+			return_val = "handshake (receiving)";
+			break;
+		case SS_VOTING:
+			return_val = "voting";
+			break;
+		case SS_SEND_VOTE:
+			return_val = "sending vote";
+			break;
+		case SS_WAIT_VERDICT:
+			return_val = "wait-for-verdict";
+			break;
+		case SS_IDLE:
+			return_val = "idle";
+			break;
+		case SS_SEND_WAL:
+			return_val = "WAL-sending";
+			break;
+		case SS_RECV_FEEDBACK:
+			return_val = "WAL-feedback-receiving";
+			break;
 	}
-	return offs;
+
+	return return_val;
 }
 
-ssize_t
-WriteSocketAsync(pgsocket sock, void const* buf, size_t size)
+/* Returns a human-readable string corresponding to the WKSockWaitKind
+ *
+ * The string should not be freed. */
+char*
+FormatWKSockWaitKind(WKSockWaitKind wait_kind)
 {
-	size_t offs = 0;
+	char* return_val;
 
-	while (size != offs)
+	switch (wait_kind)
 	{
-		ssize_t rc = send(sock, (char const*)buf + offs, size - offs, 0);
-		if (rc < 0)
-		{
-			if (errno == EINTR)
-				continue;
-			if (errno == EAGAIN || errno == EWOULDBLOCK)
-				return offs;
-			elog(WARNING, "Socket write failed: %m");
-			return -1;
-		}
-		else if (rc == 0)
-		{
-			elog(WARNING, "Connection was closed by peer");
-			return -1;
-		}
-		offs += rc;
+		/* FIX(review): all four strings here were empty (""), which makes the
+		 * "Expected %s" FATAL message in AdvancePollState useless -- the original
+		 * bracket-wrapped text appears to have been lost to markup-stripping.
+		 * Restored with descriptive text; confirm wording against upstream. */
+		case WANTS_NO_WAIT:
+			return_val = "no events";
+			break;
+		case WANTS_SOCK_READ:
+			return_val = "socket readable";
+			break;
+		case WANTS_SOCK_WRITE:
+			return_val = "socket writeable";
+			break;
+		case WANTS_SOCK_EITHER:
+			return_val = "socket readable or writeable";
+			break;
 	}
-	return offs;
+
+	return return_val;
 }
 
-bool
-WriteSocket(pgsocket sock, void const* buf, size_t size)
+/* Returns a human-readable string corresponding to the event set
+ *
+ * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the
+ * returned string may be meaningless.
+ *
+ * The string should not be freed. It should also not be expected to remain the same between
+ * function calls.
*/
+char*
+FormatEvents(uint32 events)
 {
-	char* src = (char*)buf;
-
-	while (size != 0)
+	/* FIX: was [8] -- 7 flag characters plus optional '*' plus NUL needs 9 bytes */
+	static char return_str[9];
+
+	/* Helper variable to check if there's extra bits */
+	uint32 all_flags = WL_LATCH_SET
+		| WL_SOCKET_READABLE
+		| WL_SOCKET_WRITEABLE
+		| WL_TIMEOUT
+		| WL_POSTMASTER_DEATH
+		| WL_EXIT_ON_PM_DEATH
+		| WL_SOCKET_CONNECTED;
+
+	/* The formatting here isn't supposed to be *particularly* useful -- it's just to give a
+	 * sense of what events have been triggered without needing to remember your powers of two. */
+
+	return_str[0] = (events & WL_LATCH_SET       ) ? 'L' : '_';
+	return_str[1] = (events & WL_SOCKET_READABLE ) ? 'R' : '_';
+	return_str[2] = (events & WL_SOCKET_WRITEABLE) ? 'W' : '_';
+	return_str[3] = (events & WL_TIMEOUT         ) ? 'T' : '_';
+	return_str[4] = (events & WL_POSTMASTER_DEATH) ? 'D' : '_';
+	return_str[5] = (events & WL_EXIT_ON_PM_DEATH) ? 'E' : '_';
+	/* FIX: was return_str[5] again, silently overwriting the 'E' flag */
+	return_str[6] = (events & WL_SOCKET_CONNECTED) ? 'C' : '_';
+
+	if (events & (~all_flags))
 	{
-		ssize_t rc = send(sock, src, size, 0);
-		if (rc < 0)
-		{
-			if (errno == EINTR)
-				continue;
-			elog(WARNING, "Socket write failed: %m");
-			return false;
-		}
-		else if (rc == 0)
-		{
-			elog(WARNING, "Connection was closed by peer");
-			return false;
-		}
-		size -= rc;
-		src += rc;
+		elog(WARNING, "Event formatting found unexpected component %d",
+			 events & (~all_flags));
+		return_str[7] = '*';
+		return_str[8] = '\0';
 	}
-	return true;
+	else
+		return_str[7] = '\0';
+
+	return (char *) &return_str;
 }
 
 /*
diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h
index d770473ad35..b7b35e876e5 100644
--- a/src/include/replication/walproposer.h
+++ b/src/include/replication/walproposer.h
@@ -2,10 +2,12 @@
 #define __WALKEEPER_H__
 
 #include "postgres.h"
+#include "port.h"
 #include "access/xlog_internal.h"
 #include "access/transam.h"
 #include "nodes/replnodes.h"
 #include "utils/uuid.h"
+#include "replication/walreceiver.h"
 
 #define SK_MAGIC              0xCafeCeefu
 #define 
SK_PROTOCOL_VERSION 1 @@ -15,29 +17,240 @@ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ #define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ +/* + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 + */ +#ifndef WL_NO_EVENTS +#define WL_NO_EVENTS 0 +#else +#error "WL_NO_EVENTS already defined" +#endif + extern char* wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern bool am_wal_proposer; +struct WalProposerConn; /* Defined in libpqwalproposer */ +typedef struct WalProposerConn WalProposerConn; + struct WalMessage; typedef struct WalMessage WalMessage; extern char *zenith_timeline_walproposer; extern char *zenith_tenant_walproposer; -/* WAL safekeeper state */ +/* Possible return values from ReadPGAsync */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + /* The read is ongoing. Wait until the connection is read-ready, then + * call PQconsumeInput and try again. */ + PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from WritePGAsync */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + /* There wasn't space in the buffers to queue the data; wait until the + * socket is write-ready and try again. */ + PG_ASYNC_WRITE_WOULDBLOCK, + /* The write started, but you'll need to call PQflush some more times + * to finish it off. We just tried, so it's best to wait until the + * connection is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. 
Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* WAL safekeeper state - high level */ typedef enum { + /* + * Does not have an active connection and will stay that way until + * further notice. May be paired with: + * - SPOLL_NONE + * + * Moves to SS_CONNECTING only by calls to ResetConnection. + */ SS_OFFLINE, + /* + * Currently in the process of connecting. May be paired with: + * - SPOLL_CONNECT + * + * After the connection is made, moves to SS_EXEC_STARTWALPUSH. + */ SS_CONNECTING, - SS_HANDSHAKE, + /* + * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. May be paired with: + * - SPOLL_NONE + * - SPOLL_WRITE_PQ_FLUSH + * + * After the query sends, moves to SS_WAIT_EXEC_RESULT. + */ + SS_EXEC_STARTWALPUSH, + /* + * Waiting for the result of the "START_WAL_PUSH" command. May be paired with: + * - SPOLL_PQ_CONSUME_AND_RETRY + * + * We only pair with PQconsumeInput because we *need* to wait until the socket is open for + * reading to try again. + * + * After we get a successful result, moves to SS_HANDSHAKE_SEND. + */ + SS_WAIT_EXEC_RESULT, + /* + * Executing the sending half of the handshake. May be paired with: + * - SPOLL_WRITE_PQ_FLUSH if it hasn't finished sending, + * - SPOLL_RETRY if buffers are full and we just need to try again, + * - SPOLL_NONE + * + * After sending, moves to SS_HANDSHAKE_RECV. + */ + SS_HANDSHAKE_SEND, + /* + * Executing the receiving half of the handshake. May be paired with: + * - SPOLL_PQ_CONSUME_AND_RETRY if we need more input + * - SPOLL_NONE + * + * After receiving, moves to SS_VOTING. + */ + SS_HANDSHAKE_RECV, + /* + * Currently participating in voting, but a quorum hasn't yet been reached. Idle state. May be + * paired with: + * - SPOLL_IDLE + * + * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of SS_HANDSHAKE_RECV. 
+ */ SS_VOTING, + /* + * Currently sending the assigned vote + */ + SS_SEND_VOTE, + /* + * Sent voting information, waiting to receive confirmation from the node. May be paired with: + * - SPOLL_WRITE_PQ_FLUSH + * + * After receiving, moves to SS_IDLE. + */ SS_WAIT_VERDICT, + /* + * Waiting for quorum to send WAL. Idle state. May be paired with: + * - SPOLL_IDLE + * + * Moves to SS_SEND_WAL only by calls to SendMessageToNode. + */ SS_IDLE, + /* + * Currently sending the message at currMsg. This state is only ever reached through calls to + * SendMessageToNode. May be paired with: + * - SPOLL_WRITE_PQ_FLUSH + * - SPOLL_NONE + * + * After sending, moves to SS_RECV_FEEDBACK. + */ SS_SEND_WAL, - SS_RECV_FEEDBACK + /* + * Currently reading feedback from sending the WAL. May be paired with: + * - SPOLL_PQ_CONSUME_AND_RETRY + * - SPOLL_NONE + * + * After reading, moves to (SS_SEND_WAL or SS_IDLE) by calls to + * SendMessageToNode. + */ + SS_RECV_FEEDBACK, } WalKeeperState; +/* WAL safekeeper state - individual level + * + * This type encompasses the type of polling necessary to move on to the + * next `WalKeeperState` from the current. It's things like "we need to + * call PQflush some more", or "retry the current operation". + */ +typedef enum +{ + /* + * The current state is the one we want to be in; we just haven't run + * the code for it. It should be processed with AdvancePollState to + * start to advance to the next state. + * + * Expected WKSockWaitKind: WANTS_NO_WAIT. + * + * Note! This polling state is different from the others: its attached + * WalKeeperState is what *will* be executed, not what just was. + */ + SPOLL_NONE, + /* + * We need to retry the operation once the socket permits it + * + * Expected WKSockWaitKind: Any of WANTS_SOCK_READ, WANTS_SOCK_WRITE, + * WANTS_SOCK_EITHER -- operation dependent. + */ + SPOLL_RETRY, + /* + * Marker for states that do not expect to be advanced by calls to AdvancePollState. 
Not to be + * confused with SS_IDLE, which carries a different (but related) meaning. + * + * For this polling state, we interpret any read-readiness on the socket as an indication that + * the connection has closed normally. + * + * Expected WKSockWaitKind: WANTS_SOCK_READ + */ + SPOLL_IDLE, + /* + * We need to repeat calls to PQconnectPoll. This is only available for + * SS_CONNECTING + * + * Expected WKSockWaitKind: WANTS_SOCK_READ or WANTS_SOCK_WRITE + */ + SPOLL_CONNECT, + /* Poll with PQflush, finishing up a call to WritePGAsync. Always + * combined with writing states, like SS_HANDSHAKE_SEND or SS_SEND_WAL. + * + * Expected WKSockWaitKind: WANTS_SOCK_EITHER + */ + SPOLL_WRITE_PQ_FLUSH, + /* + * Get input with PQconsumeInput and try the operation again. This is + * always combined with reading states -- like SS_HANDSHAKE_RECV or + * SS_WAIT_VERDICT, and the operation repetition helps to reduce the + * amount of repeated logic. + * + * Expected WKSockWaitKind: WANTS_SOCK_READ + */ + SPOLL_PQ_CONSUME_AND_RETRY, +} WalKeeperPollState; + +/* The state of the socket that we're waiting on. This is used to + * double-check for polling that the socket we're being handed is correct. + * + * Used in the sockWaitState field of WalKeeper, in combination with the + * WalKeeperPollState. + * + * Each polling state above lists the set of values that they accept. 
*/ +typedef enum +{ + /* No waiting is required for the poll state */ + WANTS_NO_WAIT, + /* Polling should resume only once the socket is ready for reading */ + WANTS_SOCK_READ, + /* Polling should resume only once the socket is ready for writing */ + WANTS_SOCK_WRITE, + /* Polling should resume once the socket is ready for reading or + * writing */ + WANTS_SOCK_EITHER, +} WKSockWaitKind; + /* * Unique node identifier used by Paxos */ @@ -58,7 +271,7 @@ typedef struct ServerInfo uint64 systemId; /* Postgres system identifier */ uint8 ztimelineid[16]; /* Zenith timeline id */ XLogRecPtr walEnd; - TimeLineID timeline; + TimeLineID timeline; int walSegSize; uint8 ztenantid[16]; } ServerInfo; @@ -120,6 +333,12 @@ struct WalMessage uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ WalKeeperRequest req; /* request to walkeeper (message header) */ + + /* PHANTOM FIELD: + * + * All WalMessages are allocated with exactly (size - sizeof(WalKeeperRequest)) additional bytes + * after them, containing the body of the message. This allocation is done in `CreateMessage` + * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). 
*/ }; /* @@ -138,24 +357,29 @@ typedef struct WalKeeperResponse */ typedef struct WalKeeper { - char const* host; - char const* port; - pgsocket sock; /* socket descriptor */ - WalMessage* currMsg; /* message been send to the receiver */ - int asyncOffs;/* offset for asynchronus read/write operations */ - int eventPos; /* position in wait event set */ - WalKeeperState state;/* walkeeper state machine state */ - WalKeeperInfo info; /* walkeeper info */ - WalKeeperResponse feedback; /* feedback to master */ + char const* host; + char const* port; + char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ + WalProposerConn* conn; /* postgres protocol connection to the walreceiver */ + + WalMessage* currMsg; /* message been send to the receiver */ + + int eventPos; /* position in wait event set. Equal to -1 if no event */ + WalKeeperState state; /* walkeeper state machine state */ + WalKeeperPollState pollState; /* what kind of polling is necessary to advance `state` */ + WKSockWaitKind sockWaitState; /* what state are we expecting the socket to be in for + the polling required? 
*/ + WalKeeperInfo info; /* walkeeper info */ + WalKeeperResponse feedback; /* feedback to master */ } WalKeeper; int CompareNodeId(NodeId* id1, NodeId* id2); -pgsocket ConnectSocketAsync(char const* host, char const* port, bool* established); -bool WriteSocket(pgsocket sock, void const* buf, size_t size); -ssize_t ReadSocketAsync(pgsocket sock, void* buf, size_t size); -ssize_t WriteSocketAsync(pgsocket sock, void const* buf, size_t size); int CompareLsn(const void *a, const void *b); +uint32 WaitKindAsEvents(WKSockWaitKind wait_kind); +char* FormatWalKeeperState(WalKeeperState state); +char* FormatWKSockWaitKind(WKSockWaitKind wait_kind); +char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); bool HexDecodeString(uint8 *result, char *input, int nbytes); @@ -173,4 +397,168 @@ void ProcessStandbyHSFeedback(TimestampTz replyTime, uint32 feedbackCatalogEpoch); void StartReplication(StartReplicationCmd *cmd); +/* libpqwalproposer hooks & helper type */ + +/* Re-exported PostgresPollingStatusType */ +typedef enum +{ + WP_CONN_POLLING_FAILED = 0, + WP_CONN_POLLING_READING, + WP_CONN_POLLING_WRITING, + WP_CONN_POLLING_OK, + /* + * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused. + * We've removed it here to avoid clutter. + */ +} WalProposerConnectPollStatusType; + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + /* Any success result other than a single CopyBoth was received. The specifics of the result + * were already logged, but it may be useful to provide an error message indicating which + * walkeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. */ + WP_EXEC_UNEXPECTED_SUCCESS, + /* No result available at this time. Wait until read-ready, call PQconsumeInput, then try again. 
+ * Internally, this is returned when PQisBusy indicates that PQgetResult would block. */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Re-exported ConnStatusType */ +typedef enum +{ + WP_CONNECTION_OK, + WP_CONNECTION_BAD, + + /* + * The original ConnStatusType has many more tags, but requests that + * they not be relied upon (except for displaying to the user). We + * don't need that extra functionality, so we collect them into a + * single tag here. + */ + WP_CONNECTION_IN_PROGRESS, +} WalProposerConnStatusType; + +/* Re-exported PQerrorMessage */ +typedef char* (*walprop_error_message_fn) (WalProposerConn* conn); + +/* Re-exported PQstatus */ +typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn); + +/* Re-exported PQconnectStart */ +typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); + +/* Re-exported PQconectPoll */ +typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); + +/* Re-exported PQsendQuery */ +typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); + +/* Wrapper around PQisBusy + PQgetResult */ +typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); + +/* Re-exported PQsetnonblocking */ +typedef int (*walprop_set_nonblocking_fn) (WalProposerConn* conn, int arg); + +/* Re-exported PQsocket */ +typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); + +/* Re-exported PQflush */ +typedef int (*walprop_flush_fn) (WalProposerConn* conn); + +/* Re-exported PQconsumeInput */ +typedef int (*walprop_consume_input_fn) (WalProposerConn* conn); + +/* Re-exported PQfinish */ +typedef void (*walprop_finish_fn) (WalProposerConn* conn); + +/* + * Ergonomic wrapper around PGgetCopyData + * + * Reads a CopyData block from a walkeeper, setting *amount to the number + * of bytes returned. 
+ * + * This function is allowed to assume certain properties specific to the + * protocol with the walkeepers, so it should not be used as-is for any + * other purpose. + * + * Note: If possible, using is generally preferred, + * because it performs a bit of extra checking work that's always required + * and is normally somewhat verbose. + */ +typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, + char** buf, + int* amount); + +/* + * Ergonomic wrapper around PQputCopyData + PQflush + * + * Starts to write a CopyData block to a walkeeper. + * + * For information on the meaning of return codes, refer to PGAsyncWriteResult. + */ +typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, + void const* buf, + size_t size); + +/* All libpqwalproposer exported functions collected together. */ +typedef struct WalProposerFunctionsType +{ + walprop_error_message_fn walprop_error_message; + walprop_status_fn walprop_status; + walprop_connect_start_fn walprop_connect_start; + walprop_connect_poll_fn walprop_connect_poll; + walprop_send_query_fn walprop_send_query; + walprop_get_query_result_fn walprop_get_query_result; + walprop_set_nonblocking_fn walprop_set_nonblocking; + walprop_socket_fn walprop_socket; + walprop_flush_fn walprop_flush; + walprop_consume_input_fn walprop_consume_input; + walprop_finish_fn walprop_finish; + walprop_async_read_fn walprop_async_read; + walprop_async_write_fn walprop_async_write; +} WalProposerFunctionsType; + +/* Allow the above functions to be "called" with normal syntax */ +#define walprop_error_message(conn) \ + WalProposerFunctions->walprop_error_message(conn) +#define walprop_status(conn) \ + WalProposerFunctions->walprop_status(conn) +#define walprop_connect_start(conninfo) \ + WalProposerFunctions->walprop_connect_start(conninfo) +#define walprop_connect_poll(conn) \ + WalProposerFunctions->walprop_connect_poll(conn) +#define walprop_send_query(conn, query) \ + 
WalProposerFunctions->walprop_send_query(conn, query) +#define walprop_get_query_result(conn) \ + WalProposerFunctions->walprop_get_query_result(conn) +#define walprop_set_nonblocking(conn, arg) \ + WalProposerFunctions->walprop_set_nonblocking(conn, arg) +#define walprop_socket(conn) \ + WalProposerFunctions->walprop_socket(conn) +#define walprop_flush(conn) \ + WalProposerFunctions->walprop_flush(conn) +#define walprop_consume_input(conn) \ + WalProposerFunctions->walprop_consume_input(conn) +#define walprop_finish(conn) \ + WalProposerFunctions->walprop_finish(conn) +#define walprop_async_read(conn, buf, amount) \ + WalProposerFunctions->walprop_async_read(conn, buf, amount) +#define walprop_async_write(conn, buf, size) \ + WalProposerFunctions->walprop_async_write(conn, buf, size) + +/* + * The runtime location of the libpqwalproposer functions. + * + * This pointer is set by the initializer in libpqwalproposer, so that we + * can use it later. + */ +extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions; + #endif From 743848ee6f89c5d5db78203820fe35e6081f41d2 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 16 Aug 2021 13:19:42 +0300 Subject: [PATCH 034/167] Build zenithdb/compute-node:latest in CI (zenithdb/console#125) --- .circleci/config.yml | 32 +++++++++++++++++++ .dockerignore | 5 +++ Dockerfile | 74 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 .circleci/config.yml create mode 100644 .dockerignore create mode 100644 Dockerfile diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000000..ad48e5ac396 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,32 @@ +version: 2.1 + +jobs: + + # Build zenithdb/compute-node:latest image and push it to Docker hub + docker_image: + docker: + - image: cimg/base:2021.04 + working_directory: ~/repo + steps: + - checkout: + path: ~/repo + - setup_remote_docker: + docker_layer_caching: true + - run: + name: Build 
and push Docker image + command: | + echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin + docker build -t zenithdb/compute-node:latest . && docker push zenithdb/compute-node:latest + +workflows: + version: 2 + compute_node: + jobs: + # Build and push image only for commits to `main`. + - docker_image: + # Context gives an ability to login + context: 'Docker Hub' + filters: + branches: + only: + - main diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000000..530192a3b20 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +.git +.vscode +.circleci +tmp_install +compute_build diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000000..83407413142 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,74 @@ +# +# Image with pre-built tools +# +FROM zenithdb/compute-tools:latest AS compute-deps +# Only to get ready apply_conf binary as a dep + +# +# Image with Postgres build deps +# +FROM debian:buster-slim AS build-deps + +RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ + libcurl4-openssl-dev + +# +# Image with built Postgres +# +FROM build-deps AS pg-build + +# Add user postgres +RUN adduser postgres +RUN mkdir /pg && chown postgres:postgres /pg + +# Copy source files +COPY . 
/pg/ + +# Build and install Postgres locally +RUN mkdir /pg/compute_build && cd /pg/compute_build && \ + ../configure CFLAGS='-O0 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --enable-cassert --enable-depend && \ + # Install main binaries and contribs + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/zenith install && \ + # Install headers + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install + +USER postgres +WORKDIR /pg + +# +# Final compute node image to be exported +# +FROM debian:buster-slim + +# libreadline-dev is required to run psql +RUN apt-get update && apt-get -yq install openssh-server libreadline-dev && \ + # This will prepare everything needed by sshd + # like generation host keys with ssh-keygen -A + service ssh start + +# Add user postgres +RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ + echo "postgres:test_console_pass" | chpasswd && \ + mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres/compute && \ + chown -R postgres:postgres /var/db/postgres/specs && \ + chmod 0750 /var/db/postgres/compute + +# Copy ready Postgres binaries +COPY --from=pg-build /pg/compute_build/postgres_bin /var/db/postgres/install + +# Copy apply_conf binary +COPY --from=compute-deps /usr/local/bin/apply_conf /usr/local/bin/apply_conf + +# Copy postgres binaries to the common location +RUN cp /var/db/postgres/install/bin/* /usr/local/bin/ && \ + cp -r /var/db/postgres/install/share/* /usr/local/share/ && \ + # Add postgres shared objects to the search path + echo '/var/db/postgres/install/lib' >> /etc/ld.so.conf && /sbin/ldconfig + +# To be able to run sshd (seems to be default) +# USER root + +ENTRYPOINT ["/bin/sh"] From 657c3ed0ba6d080b2c252f5a58c7bc6052ec7b72 Mon Sep 17 00:00:00 2001 From: anastasia Date: 
Mon, 23 Aug 2021 10:31:38 +0300 Subject: [PATCH 035/167] zenith_regression_tests.patch Add alternative output for tablespace test, because tablespaces are not supported in zenith yet --- src/test/regress/output/tablespace_1.source | 941 ++++++++++++++++++++ 1 file changed, 941 insertions(+) create mode 100644 src/test/regress/output/tablespace_1.source diff --git a/src/test/regress/output/tablespace_1.source b/src/test/regress/output/tablespace_1.source new file mode 100644 index 00000000000..1c3b75cb6d1 --- /dev/null +++ b/src/test/regress/output/tablespace_1.source @@ -0,0 +1,941 @@ +-- create a tablespace using WITH clause +CREATE TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +CREATE TABLESPACE regress_tblspacewith LOCATION '@testtablespace@' WITH (random_page_cost = 3.0); -- ok +-- check to see the parameter was used +SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; + spcoptions +------------------------ + {random_page_cost=3.0} +(1 row) + +-- drop the tablespace so we can re-use the location +DROP TABLESPACE regress_tblspacewith; +-- create a tablespace we can use +CREATE TABLESPACE regress_tblspace LOCATION '@testtablespace@'; +-- try setting and resetting some properties for the new tablespace +ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); +ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +ALTER TABLESPACE regress_tblspace RESET (random_page_cost = 2.0); -- fail +ERROR: RESET must not include values for parameters +ALTER TABLESPACE regress_tblspace RESET (random_page_cost, effective_io_concurrency); -- ok +-- REINDEX (TABLESPACE) +-- catalogs and system tablespaces +-- system catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_am; +ERROR: cannot move system relation 
"pg_am_name_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_am; +ERROR: cannot reindex system catalogs concurrently +-- shared catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- toast relations, fail +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +ERROR: cannot reindex system catalogs concurrently +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1260; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +ERROR: cannot reindex system catalogs concurrently +-- system catalog, fail +REINDEX (TABLESPACE pg_global) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE pg_global) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- table with toast relation +CREATE TABLE regress_tblspace_test_tbl (num1 bigint, num2 double precision, t text); +INSERT INTO regress_tblspace_test_tbl (num1, num2, t) + SELECT round(random()*100), random(), 'text' + FROM generate_series(1, 10) s(i); +CREATE INDEX regress_tblspace_test_tbl_idx ON regress_tblspace_test_tbl (num1); +-- move to global tablespace, fail +REINDEX (TABLESPACE pg_global) INDEX regress_tblspace_test_tbl_idx; +ERROR: only shared relations can be placed in pg_global tablespace +REINDEX (TABLESPACE pg_global) INDEX CONCURRENTLY regress_tblspace_test_tbl_idx; +ERROR: cannot move non-shared relation to tablespace "pg_global" +-- check transactional behavior of REINDEX (TABLESPACE) +BEGIN; +REINDEX (TABLESPACE regress_tblspace) INDEX 
regress_tblspace_test_tbl_idx; +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +ROLLBACK; +-- no relation moved to the new tablespace +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace'; + relname +--------- +(0 rows) + +-- check that all indexes are moved to a new tablespace with different +-- relfilenode. +-- Save first the existing relfilenode for the toast and main relations. +SELECT relfilenode as main_filenode FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx' \gset +SELECT relfilenode as toast_filenode FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl') \gset +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE regress_tblspace; +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +-- Move back to the default tablespace. 
+ALTER INDEX regress_tblspace_test_tbl_idx SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +--------- +(0 rows) + +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +SELECT relfilenode = :main_filenode AS main_same FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx'; + main_same +----------- + f +(1 row) + +SELECT relfilenode = :toast_filenode as toast_same FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl'); + toast_same +------------ + f +(1 row) + +DROP TABLE regress_tblspace_test_tbl; +-- REINDEX (TABLESPACE) with partitions +-- Create a partition tree and check the set of relations reindexed +-- with their new tablespace. +CREATE TABLE tbspace_reindex_part (c1 int, c2 int) PARTITION BY RANGE (c1); +CREATE TABLE tbspace_reindex_part_0 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (0) TO (10) PARTITION BY list (c2); +CREATE TABLE tbspace_reindex_part_0_1 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (1); +CREATE TABLE tbspace_reindex_part_0_2 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (2); +-- This partitioned table will have no partitions. 
+CREATE TABLE tbspace_reindex_part_10 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (10) TO (20) PARTITION BY list (c2); +-- Create some partitioned indexes +CREATE INDEX tbspace_reindex_part_index ON ONLY tbspace_reindex_part (c1); +CREATE INDEX tbspace_reindex_part_index_0 ON ONLY tbspace_reindex_part_0 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_0; +-- This partitioned index will have no partitions. +CREATE INDEX tbspace_reindex_part_index_10 ON ONLY tbspace_reindex_part_10 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_10; +CREATE INDEX tbspace_reindex_part_index_0_1 ON ONLY tbspace_reindex_part_0_1 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_1; +CREATE INDEX tbspace_reindex_part_index_0_2 ON ONLY tbspace_reindex_part_0_2 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_2; +SELECT relid, parentrelid, level FROM pg_partition_tree('tbspace_reindex_part_index') + ORDER BY relid, level; + relid | parentrelid | level +--------------------------------+------------------------------+------- + tbspace_reindex_part_index | | 0 + tbspace_reindex_part_index_0 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_10 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_0_1 | tbspace_reindex_part_index_0 | 2 + tbspace_reindex_part_index_0_2 | tbspace_reindex_part_index_0 | 2 +(5 rows) + +-- Track the original tablespace, relfilenode and OID of each index +-- in the tree. +CREATE TEMP TABLE reindex_temp_before AS + SELECT oid, relname, relfilenode, reltablespace + FROM pg_class + WHERE relname ~ 'tbspace_reindex_part_index'; +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tbspace_reindex_part; +-- REINDEX CONCURRENTLY changes the OID of the old relation, hence a check +-- based on the relation name below. 
+SELECT b.relname, + CASE WHEN a.relfilenode = b.relfilenode THEN 'relfilenode is unchanged' + ELSE 'relfilenode has changed' END AS filenode, + CASE WHEN a.reltablespace = b.reltablespace THEN 'reltablespace is unchanged' + ELSE 'reltablespace has changed' END AS tbspace + FROM reindex_temp_before b JOIN pg_class a ON b.relname = a.relname + ORDER BY 1; + relname | filenode | tbspace +--------------------------------+--------------------------+---------------------------- + tbspace_reindex_part_index | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0 | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0_1 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_0_2 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_10 | relfilenode is unchanged | reltablespace is unchanged +(5 rows) + +DROP TABLE tbspace_reindex_part; +-- create a schema we can use +CREATE SCHEMA testschema; +-- try a table +CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo'; + relname | spcname +---------+------------------ + foo | regress_tblspace +(1 row) + +INSERT INTO testschema.foo VALUES(1); +INSERT INTO testschema.foo VALUES(2); +-- tables from dynamic sources +CREATE TABLE testschema.asselect TABLESPACE regress_tblspace AS SELECT 1; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asselect'; + relname | spcname +----------+------------------ + asselect | regress_tblspace +(1 row) + +PREPARE selectsource(int) AS SELECT $1; +CREATE TABLE testschema.asexecute TABLESPACE regress_tblspace + AS EXECUTE selectsource(2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 
'asexecute'; + relname | spcname +-----------+------------------ + asexecute | regress_tblspace +(1 row) + +-- index +CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo_idx'; + relname | spcname +---------+------------------ + foo_idx | regress_tblspace +(1 row) + +-- check \d output +\d testschema.foo + Table "testschema.foo" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + i | integer | | | +Indexes: + "foo_idx" btree (i), tablespace "regress_tblspace" +Tablespace: "regress_tblspace" + +\d testschema.foo_idx + Index "testschema.foo_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + i | integer | yes | i +btree, for table "testschema.foo" +Tablespace: "regress_tblspace" + +-- +-- partitioned table +-- +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part 
SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; + relname | spcname +----------+------------------ + part | + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; +DROP TABLE testschema.part; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | regress_tblspace +(3 rows) + +\d testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Number of partitions: 2 (Use \d+ to list them.) 
+ +\d+ testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Partitions: testschema.part1 FOR VALUES IN (1), + testschema.part2 FOR VALUES IN (2) + +\d testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: testschema.part FOR VALUES IN (1) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d+ testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: testschema.part FOR VALUES IN (1) +Partition constraint: ((a IS NOT NULL) AND (a = 1)) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Number of partitions: 2 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d+ testschema.part_a_idx + Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +btree, for table "testschema.part" +Partitions: testschema.part1_a_idx, + testschema.part2_a_idx +Tablespace: "regress_tblspace" + +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_default_tab VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab (id); +CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; +-- check that default_tablespace affects index additions in ALTER TABLE +CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_tab VALUES (1); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (id); +SET default_tablespace TO ''; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_pkey + Index "testschema.test_tab_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_tab" + +SELECT * FROM testschema.test_tab; + id +---- + 1 +(1 row) + +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + +DROP TABLE testschema.test_tab; +-- let's try moving a table from one place to another +CREATE TABLE testschema.atable AS VALUES (1), (2); +CREATE UNIQUE INDEX anindex ON testschema.atable(column1); +ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; +INSERT INTO testschema.atable VALUES(3); -- ok +INSERT INTO testschema.atable VALUES(1); -- fail (checks index) +ERROR: duplicate key value violates unique constraint "anindex" +DETAIL: Key (column1)=(1) already exists. 
+SELECT COUNT(*) FROM testschema.atable; -- checks heap + count +------- + 3 +(1 row) + +-- Will fail with bad path +CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; +ERROR: directory "/no/such/location" does not exist +-- No such tablespace +CREATE TABLE bar (i int) TABLESPACE regress_nosuchspace; +ERROR: tablespace "regress_nosuchspace" does not exist +-- Fail, in use for some partitioned object +DROP TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" cannot be dropped because some objects depend on it +DETAIL: tablespace for index testschema.part_a_idx +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +-- Fail, not empty +DROP TABLESPACE regress_tblspace; +CREATE ROLE regress_tablespace_user1 login; +CREATE ROLE regress_tablespace_user2 login; +GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; +ALTER TABLESPACE regress_tblspace OWNER TO regress_tablespace_user1; +ERROR: tablespace "regress_tblspace" does not exist +CREATE TABLE testschema.tablespace_acl (c int); +-- new owner lacks permission to create this index from scratch +CREATE INDEX k ON testschema.tablespace_acl (c) TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl OWNER TO regress_tablespace_user2; +SET SESSION ROLE regress_tablespace_user2; +CREATE TABLE tablespace_table (i int) TABLESPACE regress_tblspace; -- fail +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl ALTER c TYPE bigint; +REINDEX (TABLESPACE regress_tblspace) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +RESET ROLE; +ALTER TABLESPACE regress_tblspace RENAME TO regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET 
TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should show notice that nothing was done +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should succeed +DROP TABLESPACE regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace_renamed" does not exist +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 6 other objects +DETAIL: drop cascades to table testschema.foo +drop cascades to table testschema.asselect +drop cascades to table testschema.asexecute +drop cascades to table testschema.part +drop cascades to table testschema.atable +drop cascades to table testschema.tablespace_acl +DROP ROLE regress_tablespace_user1; +DROP ROLE regress_tablespace_user2; From 9310468bcdc540d6aa89f7675dd225234aa733b1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 Aug 2021 12:37:10 +0300 Subject: [PATCH 036/167] Add test function to flush the shared buffer cache. 
--- .../zenith_test_utils--1.0.sql | 6 ++ contrib/zenith_test_utils/zenithtest.c | 69 +++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql index 6c8fe6521cf..dbf18288fd4 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -6,3 +6,9 @@ RETURNS VOID AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION clear_buffer_cache() +RETURNS VOID +AS 'MODULE_PATHNAME', 'clear_buffer_cache' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index a7eb278a09b..2d42110cf36 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -12,11 +12,14 @@ #include "fmgr.h" #include "access/xact.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(clear_buffer_cache); /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. @@ -48,3 +51,69 @@ test_consume_xids(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * Flush the buffer cache, evicting all pages that are not currently pinned. + */ +Datum +clear_buffer_cache(PG_FUNCTION_ARGS) +{ + bool save_zenith_test_evict; + + /* + * Temporarily set the zenith_test_evict GUC, so that when we pin and + * unpin a buffer, the buffer is evicted. We use that hack to evict all + * buffers, as there is no explicit "evict this buffer" function in the + * buffer manager. 
+ */ + save_zenith_test_evict = zenith_test_evict; + zenith_test_evict = true; + PG_TRY(); + { + /* Scan through all the buffers */ + for (int i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr; + uint32 buf_state; + Buffer bufferid; + bool isvalid; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blocknum; + + /* Peek into the buffer header to see what page it holds. */ + bufHdr = GetBufferDescriptor(i); + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + isvalid = true; + else + isvalid = false; + bufferid = BufferDescriptorGetBuffer(bufHdr); + rnode = bufHdr->tag.rnode; + forknum = bufHdr->tag.forkNum; + blocknum = bufHdr->tag.blockNum; + + UnlockBufHdr(bufHdr, buf_state); + + /* + * Pin the buffer, and release it again. Because we have + * zenith_test_evict==true, this will evict the page from + * the buffer cache if no one else is holding a pin on it. + */ + if (isvalid) + { + if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid)) + ReleaseBuffer(bufferid); + } + } + } + PG_FINALLY(); + { + /* restore the GUC */ + zenith_test_evict = save_zenith_test_evict; + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} From a4359e6d084bad2d6cc174b0cfc69300e7ad8c96 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 17 Aug 2021 21:16:08 +0300 Subject: [PATCH 037/167] Basic safekeeper refactoring and bug fixing. On the walproposer side, - Change the voting flow so that acceptor tells his epoch along with giving the vote, not before it; otherwise it might get immediately stale. #294 - Adjust to using separate structs for disk and network. 
ref #315 --- .../libpqwalproposer/libpqwalproposer.c | 2 + src/backend/replication/walproposer.c | 222 ++++++++++-------- src/backend/replication/walproposer_utils.c | 15 +- src/include/replication/walproposer.h | 133 ++++++----- 4 files changed, 199 insertions(+), 173 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index a5d7fec1a33..63c90f5a54b 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -1,3 +1,5 @@ +#include "postgres.h" + #include "replication/walproposer.h" #include "libpq-fe.h" diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 102ce033949..0137cc67b8e 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -4,8 +4,11 @@ * * Broadcast WAL stream to Zenith WAL acceptetors */ +#include "postgres.h" + #include #include +#include "access/xlogdefs.h" #include "replication/walproposer.h" #include "storage/latch.h" #include "miscadmin.h" @@ -39,12 +42,15 @@ static WalMessage* msgQueueHead; static WalMessage* msgQueueTail; static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ static XLogRecPtr lastSentVCLLsn; /* VCL replies have been sent to walkeeper up to here */ -static ServerInfo serverInfo; +static ProposerGreeting proposerGreeting; static WaitEventSet* waitEvents; -static WalKeeperResponse lastFeedback; +static AppendResponse lastFeedback; static XLogRecPtr restartLsn; /* Last position received by all walkeepers. 
*/ -static RequestVote prop; /* Vote request for walkeeper */ -static int leader; /* Most advanced walkeeper */ +static VoteRequest voteRequest; /* Vote request for walkeeper */ +static term_t propTerm; /* term of the proposer */ +static XLogRecPtr propVcl; /* VCL of the proposer */ +static term_t donorEpoch; /* Most advanced acceptor epoch */ +static int donor; /* Most advanced acceptor */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; @@ -187,6 +193,7 @@ ShutdownConnection(int i, bool remove_event) walkeeper[i].state = SS_OFFLINE; walkeeper[i].pollState = SPOLL_NONE; walkeeper[i].sockWaitState = WANTS_NO_WAIT; + walkeeper[i].currMsg = NULL; if (remove_event) HackyRemoveWalProposerEvent(i); @@ -281,8 +288,14 @@ GetAcknowledgedByQuorumWALPosition(void) */ for (int i = 0; i < n_walkeepers; i++) { - responses[i] = walkeeper[i].feedback.epoch == prop.epoch - ? walkeeper[i].feedback.flushLsn : prop.VCL; + /* + * Note that while we haven't pushed WAL up to VCL to the majority we + * don't really know which LSN is reliably committed as reported + * flush_lsn is physical end of wal, which can contain diverged + * history (compared to donor). + */ + responses[i] = walkeeper[i].feedback.epoch == propTerm + ? 
walkeeper[i].feedback.flushLsn : 0; } qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); @@ -302,6 +315,7 @@ HandleWalKeeperResponse(void) if (minQuorumLsn > lastFeedback.flushLsn) { lastFeedback.flushLsn = minQuorumLsn; + /* advance the replication slot */ ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); @@ -326,7 +340,7 @@ HandleWalKeeperResponse(void) Assert(restartLsn < msg->req.endLsn); restartLsn = msg->req.endLsn; } - memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(WalKeeperRequest)); + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } if (!msgQueueHead) /* queue is empty */ @@ -395,26 +409,24 @@ WalProposerMain(Datum main_arg) GetXLogReplayRecPtr(&ThisTimeLineID); - /* Fill information about server */ - serverInfo.timeline = ThisTimeLineID; - serverInfo.walEnd = GetFlushRecPtr(); - serverInfo.walSegSize = wal_segment_size; - serverInfo.pgVersion = PG_VERSION_NUM; + /* Fill the greeting package */ + proposerGreeting.tag = 'g'; + proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; + proposerGreeting.pgVersion = PG_VERSION_NUM; + pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); + proposerGreeting.systemId = GetSystemIdentifier(); if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(serverInfo.ztimelineid, zenith_timeline_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(serverInfo.ztenantid, zenith_tenant_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztenantid, 
zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); - - serverInfo.protocolVersion = SK_PROTOCOL_VERSION; - pg_strong_random(&serverInfo.nodeId.uuid, sizeof(serverInfo.nodeId.uuid)); - serverInfo.systemId = GetSystemIdentifier(); + proposerGreeting.timeline = ThisTimeLineID; + proposerGreeting.walSegSize = wal_segment_size; last_reconnect_attempt = GetCurrentTimestamp(); @@ -448,7 +460,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) elog(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = serverInfo.timeline; + cmd.timeline = proposerGreeting.timeline; cmd.startpoint = startpos; StartReplication(&cmd); } @@ -461,15 +473,17 @@ SendMessageToNode(int i, WalMessage* msg) { WalKeeper* wk = &walkeeper[i]; - /* If there is no pending message then send new one */ - if (wk->currMsg == NULL) - { - /* Skip already acknowledged messages */ - while (msg != NULL && (msg->ackMask & (1 << i)) != 0) - msg = msg->next; + /* we shouldn't be already sending something */ + Assert(wk->currMsg == NULL); + /* + * Skip already acknowledged messages. Used during start to get to the + * first not yet received message. Otherwise we always just send + * 'msg'. 
+ */ + while (msg != NULL && (msg->ackMask & (1 << i)) != 0) + msg = msg->next; - wk->currMsg = msg; - } + wk->currMsg = msg; /* Only try to send the message if it's non-null */ if (wk->currMsg) @@ -530,12 +544,15 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msgQueueHead = msg; msgQueueTail = msg; - msg->size = sizeof(WalKeeperRequest) + len; + msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.vcl = propVcl; msg->req.beginLsn = startpos; msg->req.endLsn = endpos; - msg->req.senderId = prop.nodeId; + msg->req.proposerId = proposerGreeting.proposerId; memcpy(&msg->req+1, data + XLOG_HDR_SIZE, len); Assert(msg->req.endLsn >= lastSentLsn); @@ -574,64 +591,56 @@ CreateMessageVCLOnly(void) msgQueueHead = msg; msgQueueTail = msg; - msg->size = sizeof(WalKeeperRequest); + msg->size = sizeof(AppendRequestHeader); msg->next = NULL; msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.vcl = propVcl; msg->req.beginLsn = lastSentLsn; msg->req.endLsn = lastSentLsn; - msg->req.senderId = prop.nodeId; + msg->req.proposerId = proposerGreeting.proposerId; /* restartLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ return msg; } /* - * Prepare vote request for election + * Called after majority of acceptors gave votes, it calculates the most + * advanced safekeeper (who will be the donor) and VCL -- LSN since which we'll + * write WAL in our term. + * Sets restartLsn along the way (though it is not of much use at this point). 
*/ static void -StartElection(void) +DetermineVCL(void) { // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" - XLogRecPtr initWALPos = serverInfo.walSegSize; - prop.VCL = restartLsn = initWALPos; - prop.nodeId = serverInfo.nodeId; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_VOTING) - { - prop.nodeId.term = Max(walkeeper[i].info.server.nodeId.term, prop.nodeId.term); - restartLsn = Max(walkeeper[i].info.restartLsn, restartLsn); - if (walkeeper[i].info.epoch > prop.epoch - || (walkeeper[i].info.epoch == prop.epoch && walkeeper[i].info.flushLsn > prop.VCL)) + propVcl = wal_segment_size; + donorEpoch = 0; + restartLsn = wal_segment_size; - { - prop.epoch = walkeeper[i].info.epoch; - prop.VCL = walkeeper[i].info.flushLsn; - leader = i; - } - } - } - /* Only walkeepers from most recent epoch can report it's FlushLsn to master */ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_VOTING) + if (walkeeper[i].state == SS_IDLE) { - if (walkeeper[i].info.epoch == prop.epoch) + if (walkeeper[i].voteResponse.epoch > donorEpoch || + (walkeeper[i].voteResponse.epoch == donorEpoch && + walkeeper[i].voteResponse.flushLsn > propVcl)) { - walkeeper[i].feedback.flushLsn = walkeeper[i].info.flushLsn; - } - else - { - elog(WARNING, "WalKeeper %s:%s belongs to old epoch " INT64_FORMAT " while current epoch is " INT64_FORMAT, - walkeeper[i].host, - walkeeper[i].port, - walkeeper[i].info.epoch, - prop.epoch); + donorEpoch = walkeeper[i].voteResponse.epoch; + propVcl = walkeeper[i].voteResponse.flushLsn; + donor = i; } + restartLsn = Max(walkeeper[i].voteResponse.restartLsn, restartLsn); } } - prop.nodeId.term += 1; - prop.epoch += 1; + + elog(LOG, "got votes from majority (%d) of nodes, VCL %X/%X, donor %s:%s, restart_lsn %X/%X", + quorum, + LSN_FORMAT_ARGS(propVcl), + walkeeper[donor].host, walkeeper[donor].port, + LSN_FORMAT_ARGS(restartLsn) + ); } /* @@ -675,7 +684,7 @@ ReconnectWalKeepers(void) * Receive 
WAL from most advanced WAL keeper */ static bool -WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) { char conninfo[MAXCONNINFO]; char *err; @@ -683,18 +692,18 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe WalRcvStreamOptions options; sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", - walkeeper[leader].host, walkeeper[leader].port, zenith_timeline_walproposer); + walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { ereport(WARNING, (errmsg("could not connect to WAL acceptor %s:%s: %s", - walkeeper[leader].host, walkeeper[leader].port, + walkeeper[donor].host, walkeeper[donor].port, err))); return false; } elog(LOG, "Start recovery from %s:%s starting from %X/%08X till %X/%08X timeline %d", - walkeeper[leader].host, walkeeper[leader].port, + walkeeper[donor].host, walkeeper[donor].port, (uint32)(startpos>>32), (uint32)startpos, (uint32)(endpos >> 32), (uint32)endpos, timeline); @@ -736,7 +745,7 @@ WalProposerRecovery(int leader, TimeLineID timeline, XLogRecPtr startpos, XLogRe { for (WalMessage* msg = msgQueueHead; msg != NULL; msg = msg->next) { - if (msg->req.endLsn <= walkeeper[i].info.flushLsn) + if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) { msg->ackMask |= 1 << i; /* message is already received by this walkeeper */ } @@ -1142,7 +1151,7 @@ AdvancePollState(int i, uint32 events) /* Note: This state corresponds to the process of sending the relevant information * along. The moment we finish sending, we use SS_HANDSHAKE_RECV to complete the * handshake. 
*/ - switch (walprop_async_write(wk->conn, &serverInfo, sizeof(serverInfo))) + switch (walprop_async_write(wk->conn, &proposerGreeting, sizeof(proposerGreeting))) { case PG_ASYNC_WRITE_SUCCESS: /* If the write immediately succeeds, we can move on to the next state. */ @@ -1183,24 +1192,20 @@ AdvancePollState(int i, uint32 events) case SS_HANDSHAKE_RECV: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->info, sizeof(wk->info))) + if (!ReadPGAsyncIntoValue(i, &wk->greet, sizeof(wk->greet))) return; - /* Check protocol version */ - if (wk->info.server.protocolVersion != SK_PROTOCOL_VERSION) - { - elog(WARNING, "WalKeeper has incompatible protocol version %d vs. %d", - wk->info.server.protocolVersion, SK_PROTOCOL_VERSION); - ResetConnection(i); - return; - } - - /* Protocol is all good, move to voting */ wk->state = SS_VOTING; wk->pollState = SPOLL_IDLE; wk->feedback.flushLsn = restartLsn; wk->feedback.hs.ts = 0; + /* + * We want our term to be highest and unique, so choose max + * and +1 once we have majority. + */ + propTerm = Max(walkeeper[i].greet.term, propTerm); + /* Check if we have quorum. If there aren't enough walkeepers, wait and do nothing. * We'll eventually get a task when the election starts. 
* @@ -1208,9 +1213,17 @@ AdvancePollState(int i, uint32 events) if (++n_connected >= quorum) { if (n_connected == quorum) - StartElection(); + { + propTerm++; + /* prepare voting message */ + voteRequest = (VoteRequest) { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + } - /* Now send max-node-id to everyone participating in voting and wait their responses */ + /* Now send voting request to the cohort and wait responses */ for (int j = 0; j < n_walkeepers; j++) { /* Remember: SS_VOTING indicates that the walkeeper is participating in @@ -1240,7 +1253,7 @@ AdvancePollState(int i, uint32 events) /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: - switch (walprop_async_write(wk->conn, &prop, sizeof(prop))) + switch (walprop_async_write(wk->conn, &voteRequest, sizeof(voteRequest))) { case PG_ASYNC_WRITE_SUCCESS: /* If the write immediately succeeds, we can move on to the next state. */ @@ -1278,16 +1291,24 @@ AdvancePollState(int i, uint32 events) case SS_WAIT_VERDICT: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->info.server.nodeId, sizeof(wk->info.server.nodeId))) + if (!ReadPGAsyncIntoValue(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; - /* If server accept our candidate, then it returns it in response */ - if (CompareNodeId(&wk->info.server.nodeId, &prop.nodeId) != 0) + + /* + * In case of acceptor rejecting our vote, bail out, but only if + * either it already lives in strictly higher term (concurrent + * compute spotted) or we are not elected yet and thus need the + * vote. 
+ */ + if ((!wk->voteResponse.voteGiven) && + (wk->voteResponse.term > propTerm || n_votes < quorum)) { - elog(FATAL, "WalKeeper %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", wk->host, wk->port, - wk->info.server.nodeId.term, prop.nodeId.term); + wk->voteResponse.term, propTerm); } + Assert(wk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? */ wk->state = SS_IDLE; @@ -1296,21 +1317,18 @@ AdvancePollState(int i, uint32 events) if (++n_votes == quorum) { - elog(LOG, "Successfully established connection with %d nodes, VCL %X/%X", - quorum, - (uint32) (prop.VCL >> 32), (uint32) (prop.VCL) - ); + DetermineVCL(); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn < prop.VCL) + if (restartLsn < propVcl) { - elog(LOG, "Start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", - LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(prop.VCL)); + elog(LOG, "start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", + LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(propVcl)); /* Perform recovery */ - if (!WalProposerRecovery(leader, serverInfo.timeline, restartLsn, prop.VCL)) + if (!WalProposerRecovery(donor, proposerGreeting.timeline, restartLsn, propVcl)) elog(FATAL, "Failed to recover state"); } - WalProposerStartStreaming(prop.VCL); + WalProposerStartStreaming(propVcl); /* Should not return here */ } else @@ -1331,7 +1349,7 @@ AdvancePollState(int i, uint32 events) if (wk->pollState != SPOLL_RETRY) { elog(LOG, "Sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %s:%s", - msg->size - sizeof(WalKeeperRequest), + msg->size - sizeof(AppendRequestHeader), LSN_FORMAT_ARGS(msg->req.commitLsn), LSN_FORMAT_ARGS(restartLsn), wk->host, wk->port); @@ -1464,7 +1482,7 @@ ReadPGAsyncIntoValue(int i, void* 
value, size_t value_size) "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", wk->host, wk->port, FormatWalKeeperState(wk->state), - sizeof(wk->info.server.nodeId), buf_size); + value_size, buf_size); } /* Copy the resulting info into place */ diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 722fa66d5e6..29c209e63c1 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -1,3 +1,5 @@ +#include "postgres.h" + #include "replication/walproposer.h" #include "common/logging.h" #include "common/ip.h" @@ -5,16 +7,6 @@ #include #include -int CompareNodeId(NodeId* id1, NodeId* id2) -{ - return - (id1->term < id2->term) - ? -1 - : (id1->term > id2->term) - ? 1 - : memcmp(&id1->uuid, &id1->uuid, sizeof(pg_uuid_t)); -} - int CompareLsn(const void *a, const void *b) { @@ -30,7 +22,7 @@ CompareLsn(const void *a, const void *b) } /* Converts a `WKSockWaitKind` into the bit flags that would match it - * + * * Note: For `wait_kind = WANTS_NO_WAIT`, this will return a value of zero, * which does not match any events. Attempting to wait on no events will * always timeout, so it's best to double-check the value being provided to @@ -231,4 +223,3 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } - diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index b7b35e876e5..3f03f43eb2a 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -1,6 +1,7 @@ #ifndef __WALKEEPER_H__ #define __WALKEEPER_H__ +#include "access/xlogdefs.h" #include "postgres.h" #include "port.h" #include "access/xlog_internal.h" @@ -172,7 +173,7 @@ typedef enum } WalKeeperState; /* WAL safekeeper state - individual level - * + * * This type encompasses the type of polling necessary to move on to the * next `WalKeeperState` from the current. 
It's things like "we need to * call PQflush some more", or "retry the current operation". @@ -251,77 +252,75 @@ typedef enum WANTS_SOCK_EITHER, } WKSockWaitKind; -/* - * Unique node identifier used by Paxos - */ -typedef struct NodeId -{ - uint64 term; - pg_uuid_t uuid; -} NodeId; +/* Consensus logical timestamp. */ +typedef uint64 term_t; /* - * Information about Postgres server broadcasted by WAL proposer to walkeeper + * Proposer -> Acceptor messaging. */ -typedef struct ServerInfo + +/* Initial Proposer -> Acceptor message */ +typedef struct ProposerGreeting { - uint32 protocolVersion; /* proposer-walkeeper protocol version */ - uint32 pgVersion; /* Postgres server version */ - NodeId nodeId; - uint64 systemId; /* Postgres system identifier */ - uint8 ztimelineid[16]; /* Zenith timeline id */ - XLogRecPtr walEnd; + uint64 tag; /* message tag */ + uint32 protocolVersion; /* proposer-walkeeper protocol version */ + uint32 pgVersion; + pg_uuid_t proposerId; + uint64 systemId; /* Postgres system identifier */ + uint8 ztimelineid[16]; /* Zenith timeline id */ + uint8 ztenantid[16]; TimeLineID timeline; - int walSegSize; - uint8 ztenantid[16]; -} ServerInfo; + uint32 walSegSize; +} ProposerGreeting; /* - * Vote request sent from proposer to walkeepers + * Acceptor -> Proposer initial response: the highest term acceptor voted for. */ -typedef struct RequestVote +typedef struct AcceptorGreeting { - NodeId nodeId; - XLogRecPtr VCL; /* volume commit LSN */ - uint64 epoch; /* new epoch when walkeeper reaches VCL */ -} RequestVote; + uint64 tag; + term_t term; +} AcceptorGreeting; /* - * Information of about storage node + * Proposer -> Acceptor vote request. 
*/ -typedef struct WalKeeperInfo +typedef struct VoteRequest { - uint32 magic; /* magic for verifying content the control file */ - uint32 formatVersion; /* walkeeper format version */ - uint64 epoch; /* walkeeper's epoch */ - ServerInfo server; - XLogRecPtr commitLsn; /* part of WAL acknowledged by quorum */ - XLogRecPtr flushLsn; /* locally flushed part of WAL */ - XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper: min(commitLsn) for all walkeepers */ -} WalKeeperInfo; - -/* - * Hot standby feedback received from replica - */ -typedef struct HotStandbyFeedback -{ - TimestampTz ts; - FullTransactionId xmin; - FullTransactionId catalog_xmin; -} HotStandbyFeedback; - + uint64 tag; + term_t term; + pg_uuid_t proposerId; /* for monitoring/debugging */ +} VoteRequest; + +/* Vote itself, sent from safekeeper to proposer */ +typedef struct VoteResponse { + uint64 tag; + term_t term; /* not really needed, just adds observability */ + uint64 voteGiven; + /// Safekeeper's log position, to let proposer choose the most advanced one + term_t epoch; + XLogRecPtr flushLsn; + XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ +} VoteResponse; /* - * Request with WAL message sent from proposer to walkeeper. + * Header of request with WAL message sent from proposer to walkeeper. */ -typedef struct WalKeeperRequest +typedef struct AppendRequestHeader { - NodeId senderId; /* Sender's node identifier (looks like we do not need it for TCP streaming connection) */ + uint64 tag; + term_t term; /* term of the proposer */ + /* + * LSN since which current proposer appends WAL; determines epoch switch + * point. 
+ */ + XLogRecPtr vcl; XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ -} WalKeeperRequest; + XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ + pg_uuid_t proposerId; /* for monitoring/debugging */ +} AppendRequestHeader; /* * All copy data message ('w') are linked in L1 send list and asynchronously sent to receivers. @@ -332,7 +331,7 @@ struct WalMessage WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ - WalKeeperRequest req; /* request to walkeeper (message header) */ + AppendRequestHeader req; /* request to walkeeper (message header) */ /* PHANTOM FIELD: * @@ -341,15 +340,31 @@ struct WalMessage * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). */ }; +/* + * Hot standby feedback received from replica + */ +typedef struct HotStandbyFeedback +{ + TimestampTz ts; + FullTransactionId xmin; + FullTransactionId catalog_xmin; +} HotStandbyFeedback; + /* * Report walkeeper state to proposer */ -typedef struct WalKeeperResponse +typedef struct AppendResponse { - uint64 epoch; + /* + * Current term of the safekeeper; if it is higher than proposer's, the + * compute is out of date. + */ + uint64 tag; + term_t term; + term_t epoch; XLogRecPtr flushLsn; HotStandbyFeedback hs; -} WalKeeperResponse; +} AppendResponse; /* @@ -369,12 +384,12 @@ typedef struct WalKeeper WalKeeperPollState pollState; /* what kind of polling is necessary to advance `state` */ WKSockWaitKind sockWaitState; /* what state are we expecting the socket to be in for the polling required? 
*/ - WalKeeperInfo info; /* walkeeper info */ - WalKeeperResponse feedback; /* feedback to master */ + AcceptorGreeting greet; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse feedback; /* feedback to master */ } WalKeeper; -int CompareNodeId(NodeId* id1, NodeId* id2); int CompareLsn(const void *a, const void *b); uint32 WaitKindAsEvents(WKSockWaitKind wait_kind); char* FormatWalKeeperState(WalKeeperState state); From 3287dbbaafbc7b5d2b5e45f46b3068d3680005e1 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 27 Aug 2021 13:33:53 +0300 Subject: [PATCH 038/167] Rename VCL to epochStartLsn and restart_lsn to truncate_lsn. epochStartLsn is the LSN since which new proposer writes its WAL in its epoch, let's be more explicit here. In several places it also actually meant something we call *commit_lsn* -- the latest lsn known to be reliably commited (it constantly moves within one wal proposer). truncate_lsn is LSN still needed by the most lagging safekeeper. restart_lsn is terminology from pg_replicaton_slots, but here we don't really have 'restart'; hopefully truncate word makes it clearer. 
--- src/backend/replication/walproposer.c | 83 ++++++++++++++------------- src/include/replication/walproposer.h | 16 ++++-- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 0137cc67b8e..8e46f52b15f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -41,14 +41,14 @@ static WalKeeper walkeeper[MAX_WALKEEPERS]; static WalMessage* msgQueueHead; static WalMessage* msgQueueTail; static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ -static XLogRecPtr lastSentVCLLsn; /* VCL replies have been sent to walkeeper up to here */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to walkeepers */ static ProposerGreeting proposerGreeting; static WaitEventSet* waitEvents; static AppendResponse lastFeedback; -static XLogRecPtr restartLsn; /* Last position received by all walkeepers. */ +static XLogRecPtr truncateLsn; /* Last position received by all walkeepers. */ static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ -static XLogRecPtr propVcl; /* VCL of the proposer */ +static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ static int donor; /* Most advanced acceptor */ static int n_votes = 0; @@ -289,7 +289,7 @@ GetAcknowledgedByQuorumWALPosition(void) for (int i = 0; i < n_walkeepers; i++) { /* - * Note that while we haven't pushed WAL up to VCL to the majority we + * Note that while we haven't pushed WAL up to epoch start lsn to the majority we * don't really know which LSN is reliably committed as reported * flush_lsn is physical end of wal, which can contain diverged * history (compared to donor). 
@@ -335,10 +335,10 @@ HandleWalKeeperResponse(void) { WalMessage* msg = msgQueueHead; msgQueueHead = msg->next; - if (restartLsn < msg->req.beginLsn) + if (truncateLsn < msg->req.beginLsn) { - Assert(restartLsn < msg->req.endLsn); - restartLsn = msg->req.endLsn; + Assert(truncateLsn < msg->req.endLsn); + truncateLsn = msg->req.endLsn; } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); @@ -488,7 +488,7 @@ SendMessageToNode(int i, WalMessage* msg) /* Only try to send the message if it's non-null */ if (wk->currMsg) { - wk->currMsg->req.restartLsn = restartLsn; + wk->currMsg->req.truncateLsn = truncateLsn; wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); /* Once we've selected and set up our message, actually start sending it. */ @@ -549,7 +549,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; - msg->req.vcl = propVcl; + msg->req.epochStartLsn = propEpochStartLsn; msg->req.beginLsn = startpos; msg->req.endLsn = endpos; msg->req.proposerId = proposerGreeting.proposerId; @@ -570,10 +570,10 @@ WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) /* * Create WAL message with no data, just to let the walkeepers - * know that the VCL has advanced. + * know that commit lsn has advanced. 
*/ static WalMessage* -CreateMessageVCLOnly(void) +CreateMessageCommitLsnOnly(void) { /* Create new message and append it to message queue */ WalMessage* msg; @@ -596,28 +596,29 @@ CreateMessageVCLOnly(void) msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; - msg->req.vcl = propVcl; + msg->req.epochStartLsn = propEpochStartLsn; msg->req.beginLsn = lastSentLsn; msg->req.endLsn = lastSentLsn; msg->req.proposerId = proposerGreeting.proposerId; - /* restartLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ + /* truncateLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ return msg; } /* * Called after majority of acceptors gave votes, it calculates the most - * advanced safekeeper (who will be the donor) and VCL -- LSN since which we'll - * write WAL in our term. - * Sets restartLsn along the way (though it is not of much use at this point). + * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since + * which we'll write WAL in our term. + * Sets truncateLsn along the way (though it + * is not of much use at this point). 
*/ static void -DetermineVCL(void) +DetermineEpochStartLsn(void) { // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" - propVcl = wal_segment_size; + propEpochStartLsn = wal_segment_size; donorEpoch = 0; - restartLsn = wal_segment_size; + truncateLsn = wal_segment_size; for (int i = 0; i < n_walkeepers; i++) { @@ -625,21 +626,21 @@ DetermineVCL(void) { if (walkeeper[i].voteResponse.epoch > donorEpoch || (walkeeper[i].voteResponse.epoch == donorEpoch && - walkeeper[i].voteResponse.flushLsn > propVcl)) + walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) { donorEpoch = walkeeper[i].voteResponse.epoch; - propVcl = walkeeper[i].voteResponse.flushLsn; + propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; donor = i; } - restartLsn = Max(walkeeper[i].voteResponse.restartLsn, restartLsn); + truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); } } - elog(LOG, "got votes from majority (%d) of nodes, VCL %X/%X, donor %s:%s, restart_lsn %X/%X", + elog(LOG, "got votes from majority (%d) of nodes, epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", quorum, - LSN_FORMAT_ARGS(propVcl), + LSN_FORMAT_ARGS(propEpochStartLsn), walkeeper[donor].host, walkeeper[donor].port, - LSN_FORMAT_ARGS(restartLsn) + LSN_FORMAT_ARGS(truncateLsn) ); } @@ -1197,7 +1198,7 @@ AdvancePollState(int i, uint32 events) wk->state = SS_VOTING; wk->pollState = SPOLL_IDLE; - wk->feedback.flushLsn = restartLsn; + wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; /* @@ -1317,18 +1318,18 @@ AdvancePollState(int i, uint32 events) if (++n_votes == quorum) { - DetermineVCL(); + DetermineEpochStartLsn(); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ - if (restartLsn < propVcl) + if (truncateLsn < propEpochStartLsn) { - elog(LOG, "start recovery because restart LSN=%X/%X is not equal to VCL=%X/%X", - LSN_FORMAT_ARGS(restartLsn), LSN_FORMAT_ARGS(propVcl)); + elog(LOG, "start recovery because 
restart LSN=%X/%X is not equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, restartLsn, propVcl)) + if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); } - WalProposerStartStreaming(propVcl); + WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ } else @@ -1348,10 +1349,10 @@ AdvancePollState(int i, uint32 events) /* Don't repeat logs if we have to retry the actual send operation itself */ if (wk->pollState != SPOLL_RETRY) { - elog(LOG, "Sending message with len %ld VCL=%X/%X restart LSN=%X/%X to %s:%s", + elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(restartLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); } @@ -1386,7 +1387,7 @@ AdvancePollState(int i, uint32 events) { WalMessage* next; XLogRecPtr minQuorumLsn; - WalMessage* vclUpdateMsg; + WalMessage* commitLsnUpdateMsg; /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ @@ -1407,19 +1408,19 @@ AdvancePollState(int i, uint32 events) SendMessageToNode(i, next); /* - * Also send the new VCL to all the walkeepers. + * Also send the new commit lsn to all the walkeepers. * * FIXME: This is redundant for walkeepers that have other outbound messages * pending. 
*/ minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastSentVCLLsn) + if (minQuorumLsn > lastSentCommitLsn) { - vclUpdateMsg = CreateMessageVCLOnly(); - if (vclUpdateMsg) - BroadcastMessage(vclUpdateMsg); - lastSentVCLLsn = minQuorumLsn; + commitLsnUpdateMsg = CreateMessageCommitLsnOnly(); + if (commitLsnUpdateMsg) + BroadcastMessage(commitLsnUpdateMsg); + lastSentCommitLsn = minQuorumLsn; } break; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 3f03f43eb2a..af4d877963d 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -300,7 +300,7 @@ typedef struct VoteResponse { /// Safekeeper's log position, to let proposer choose the most advanced one term_t epoch; XLogRecPtr flushLsn; - XLogRecPtr restartLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ } VoteResponse; /* @@ -311,15 +311,19 @@ typedef struct AppendRequestHeader uint64 tag; term_t term; /* term of the proposer */ /* - * LSN since which current proposer appends WAL; determines epoch switch - * point. + * LSN since which current proposer appends WAL (begin_lsn of its first + * record); determines epoch switch point. 
*/ - XLogRecPtr vcl; + XLogRecPtr epochStartLsn; XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ - XLogRecPtr restartLsn; /* restart LSN position (minimal LSN which may be needed by proposer to perform recovery) */ - pg_uuid_t proposerId; /* for monitoring/debugging */ + /* + * minimal LSN which may be needed for recovery of some safekeeper (end lsn + * + 1 of last record streamed to everyone) + */ + XLogRecPtr truncateLsn; + pg_uuid_t proposerId; /* for monitoring/debugging */ } AppendRequestHeader; /* From 09a8680a381cc04b88aa28890118ca0a209e5878 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 3 Aug 2021 19:16:54 +0300 Subject: [PATCH 039/167] [refer #27] Implement shared relsize cache to improve zenith performance. Cache relfilenode size returned by zenith_nblocks() and also update it when relation is extended. Don't update it from zenith_write() or zenith_wallog_page(), since there is no guarantee that these functions wouldn't be called for some page that is not the last one It can be configured with zenith.relsize_hash_size GUC parameter. Set it to 0 to disable caching. 
--- contrib/zenith/Makefile | 2 +- contrib/zenith/libpagestore.c | 2 + contrib/zenith/pagestore_client.h | 7 ++ contrib/zenith/pagestore_smgr.c | 10 +- contrib/zenith/relsize_cache.c | 150 ++++++++++++++++++++++++++++++ 5 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 contrib/zenith/relsize_cache.c diff --git a/contrib/zenith/Makefile b/contrib/zenith/Makefile index ad41c55bd71..4b706186fff 100644 --- a/contrib/zenith/Makefile +++ b/contrib/zenith/Makefile @@ -4,7 +4,7 @@ MODULE_big = zenith OBJS = \ $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o + inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 142999a6a8e..b726cee80f8 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -326,6 +326,8 @@ _PG_init(void) 0, NULL, NULL, NULL); + relsize_hash_init(); + if (page_server != NULL) zenith_log(ERROR, "libpqpagestore already loaded"); diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index b4b223d3c46..dbcaa5fdb91 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -149,4 +149,11 @@ extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* utils for zenith relsize cache */ +extern void relsize_hash_init(void); +extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); +extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); + #endif diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 47a37b0687d..5db79710d68 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -25,6 
+25,7 @@ #include "storage/bufmgr.h" #include "fmgr.h" #include "miscadmin.h" +#include "pgstat.h" #include "replication/walsender.h" #include "catalog/pg_tablespace_d.h" @@ -565,6 +566,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr lsn; zenith_wallog_page(reln, forkNum, blkno, buffer); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno+1); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -871,9 +873,12 @@ BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum) { ZenithResponse *resp; - int n_blocks; + BlockNumber n_blocks; XLogRecPtr request_lsn; + if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + return n_blocks; + request_lsn = zenith_get_request_lsn(false); resp = page_server->request((ZenithRequest) { .tag = T_ZenithNblocksRequest, @@ -884,6 +889,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) .lsn = request_lsn }); n_blocks = resp->n_blocks; + update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", reln->smgr_rnode.node.spcNode, @@ -905,6 +911,8 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { XLogRecPtr lsn; + set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); + /* * Truncating a relation drops all its buffers from the buffer cache without * calling smgrwrite() on them. But we must account for that in our tracking diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c new file mode 100644 index 00000000000..5cb86e116a7 --- /dev/null +++ b/contrib/zenith/relsize_cache.c @@ -0,0 +1,150 @@ +/*------------------------------------------------------------------------- + * + * relsize_cache.c + * Relation size cache for better zentih performance. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * contrib/zenith/relsize_cache.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pagestore_client.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/lwlock.h" +#include "storage/ipc.h" +#include "storage/shmem.h" +#include "catalog/pg_tablespace_d.h" +#include "utils/dynahash.h" +#include "utils/guc.h" + + +typedef struct +{ + RelFileNode rnode; + ForkNumber forknum; +} RelTag; + +typedef struct +{ + RelTag tag; + BlockNumber size; +} RelSizeEntry; + +static HTAB *relsize_hash; +static LWLockId relsize_lock; +static int relsize_hash_size; +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; + +static void +zenith_smgr_shmem_startup(void) +{ + static HASHCTL info; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + relsize_lock = (LWLockId)GetNamedLWLockTranche("zenith_relsize"); + info.keysize = sizeof(RelTag); + info.entrysize = sizeof(RelSizeEntry); + relsize_hash = ShmemInitHash("zenith_relsize", + relsize_hash_size, relsize_hash_size, + &info, + HASH_ELEM | HASH_BLOBS); + LWLockRelease(AddinShmemInitLock); +} + +bool +get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size) +{ + bool found = false; + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry* entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_SHARED); + entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL); + if (entry != NULL) + { + *size = entry->size; + found = true; + } + LWLockRelease(relsize_lock); + } + return found; +} + +void +set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry* 
entry; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + RelSizeEntry* entry; + bool found; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, LW_EXCLUSIVE); + entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); + if (!found || entry->size < size) + entry->size = size; + LWLockRelease(relsize_lock); + } +} + +void +relsize_hash_init(void) +{ + DefineCustomIntVariable("zenith.relsize_hash_size", + "Sets the maximum number of cached relation sizes for zenith", + NULL, + &relsize_hash_size, + /* + * Size of cache entry is 20 bytes. + * So 64 entry will take about 1.2 Mb, + * which seems to be a reasonable default. + */ + 64*1024, + 0, + INT_MAX, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + + if (relsize_hash_size > 0) + { + RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestNamedLWLockTranche("zenith_relsize", 1); + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = zenith_smgr_shmem_startup; + } +} \ No newline at end of file From 82d969e0b78c1851274a1989c1321da47154da55 Mon Sep 17 00:00:00 2001 From: Max Sharnoff Date: Tue, 31 Aug 2021 13:05:39 -0700 Subject: [PATCH 040/167] Cleanup walproposer changes from #60 Closes #66. Mostly corresponds to cleaning up the states we store. Goes back to single states for each WalKeeper, and we perform blocking writes for everything but sending the WAL itself. A few things have been factored out into libpqwalproposer for simplicity - like handling the nonblocking status of the connection (even though it's only changed once). 
--- .../libpqwalproposer/libpqwalproposer.c | 119 ++- src/backend/replication/walproposer.c | 815 +++++++----------- src/backend/replication/walproposer_utils.c | 137 ++- src/include/replication/walproposer.h | 257 ++---- 4 files changed, 575 insertions(+), 753 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 63c90f5a54b..1b8a53b5066 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -11,6 +11,7 @@ void _PG_init(void); struct WalProposerConn { PGconn* pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ }; /* Prototypes for exported functions */ @@ -20,13 +21,12 @@ static WalProposerConn* libpqprop_connect_start(char* conninfo); static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn); static bool libpqprop_send_query(WalProposerConn* conn, char* query); static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); -static int libpqprop_set_nonblocking(WalProposerConn* conn, int arg); static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn); -static int libpqprop_consume_input(WalProposerConn* conn); +static int libpqprop_flush(WalProposerConn* conn, bool socket_read_ready); static void libpqprop_finish(WalProposerConn* conn); static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); +static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size); static WalProposerFunctionsType PQWalProposerFunctions = { libpqprop_error_message, @@ -35,13 +35,12 @@ static WalProposerFunctionsType PQWalProposerFunctions = { libpqprop_connect_poll, libpqprop_send_query, libpqprop_get_query_result, - 
libpqprop_set_nonblocking, libpqprop_socket, libpqprop_flush, - libpqprop_consume_input, libpqprop_finish, libpqprop_async_read, libpqprop_async_write, + libpqprop_blocking_write, }; /* Module initialization */ @@ -53,6 +52,22 @@ _PG_init(void) WalProposerFunctions = &PQWalProposerFunctions; } +/* Helper function */ +static bool +ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking) +{ + /* If we're already correctly blocking or nonblocking, all good */ + if (is_nonblocking == conn->is_nonblocking) + return true; + + /* Otherwise, set it appropriately */ + if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1) + return false; + + conn->is_nonblocking = is_nonblocking; + return true; +} + /* Exported function definitions */ static char* libpqprop_error_message(WalProposerConn* conn) @@ -96,6 +111,7 @@ libpqprop_connect_start(char* conninfo) */ conn = palloc(sizeof(WalProposerConn)); conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking mode */ return conn; } @@ -133,22 +149,16 @@ libpqprop_connect_poll(WalProposerConn* conn) static bool libpqprop_send_query(WalProposerConn* conn, char* query) { - int result; - bool return_val; + /* We need to be in blocking mode for sending the query to run without + * requiring a call to PQflush */ + if (!ensure_nonblocking_status(conn, false)) + return false; - switch ((result = PQsendQuery(conn->pg_conn, query))) - { - case 0: - return_val = false; - break; - case 1: - return_val = true; - break; - default: - elog(FATAL, "unexpected return %d from PQsendQuery", result); - } + /* PQsendQuery returns 1 on success, 0 on failure */ + if (!PQsendQuery(conn->pg_conn, query)) + return false; - return return_val; + return true; } static WalProposerExecStatusType @@ -160,6 +170,10 @@ libpqprop_get_query_result(WalProposerConn* conn) /* Marker variable if we need to log an unexpected success result */ char* unexpected_success = NULL; + /* Consume any input that we might be 
missing */ + if (!PQconsumeInput(conn->pg_conn)) + return WP_EXEC_FAILED; + if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; @@ -218,12 +232,6 @@ libpqprop_get_query_result(WalProposerConn* conn) return return_val; } -static int -libpqprop_set_nonblocking(WalProposerConn* conn, int arg) -{ - return PQsetnonblocking(conn->pg_conn, arg); -} - static pgsocket libpqprop_socket(WalProposerConn* conn) { @@ -231,15 +239,14 @@ libpqprop_socket(WalProposerConn* conn) } static int -libpqprop_flush(WalProposerConn* conn) +libpqprop_flush(WalProposerConn* conn, bool socket_read_ready) { - return (PQflush(conn->pg_conn)); -} + /* If the socket is read-ready, we have to call PQconsumeInput before + * calling PQflush (according to libpq docs) */ + if (socket_read_ready && !PQconsumeInput(conn->pg_conn)) + return -1; /* return failure if PQconsumeInput fails */ -static int -libpqprop_consume_input(WalProposerConn* conn) -{ - return (PQconsumeInput(conn->pg_conn)); + return (PQflush(conn->pg_conn)); } static void @@ -254,6 +261,10 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) { int result; + /* Call PQconsumeInput so that we have the data we need */ + if (!PQconsumeInput(conn->pg_conn)) + return PG_ASYNC_READ_FAIL; + /* The docs for PQgetCopyData list the return values as: * 0 if the copy is still in progress, but no "complete row" is * available @@ -267,7 +278,7 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) switch (result = PQgetCopyData(conn->pg_conn, buf, true)) { case 0: - return PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN; + return PG_ASYNC_READ_TRY_AGAIN; case -1: /* As mentioned above; this shouldn't happen */ elog(FATAL, "unexpected return -1 from PQgetCopyData"); @@ -286,21 +297,26 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) { int result; + /* If we aren't in non-blocking mode, switch to it. 
*/ + if (!ensure_nonblocking_status(conn, true)) + return PG_ASYNC_WRITE_FAIL; + /* The docs for PQputcopyData list the return values as: * 1 if the data was queued, * 0 if it was not queued because of full buffers, or * -1 if an error occured */ - switch (result = PQputCopyData(conn->pg_conn, buf, size)) + result = PQputCopyData(conn->pg_conn, buf, size); + + /* We won't get a result of zero because walproposer always empties the + * connection's buffers before sending more */ + Assert(result != 0); + + switch (result) { case 1: /* good -- continue */ break; - case 0: - /* FIXME: can this ever happen? the structure of walproposer - * should always empty the connection's buffers before trying - * to send more, right? */ - return PG_ASYNC_WRITE_WOULDBLOCK; case -1: return PG_ASYNC_WRITE_FAIL; default: @@ -327,3 +343,28 @@ libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size) elog(FATAL, "invalid return %d from PQflush", result); } } + +static bool +libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size) +{ + int result; + + /* If we are in non-blocking mode, switch out of it. */ + if (!ensure_nonblocking_status(conn, false)) + return false; + + /* Ths function is very similar to libpqprop_async_write. 
For more + * information, refer to the comments there */ + if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1) + return false; + + Assert(result == 1); + + /* Because the connection is non-blocking, flushing returns 0 or -1 */ + + if ((result = PQflush(conn->pg_conn)) == -1) + return false; + + Assert(result == 0); + return true; +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 8e46f52b15f..1bbe5f30b3a 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -54,11 +54,13 @@ static int donor; /* Most advanced acceptor */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; -static uint32 request_poll_immediate; /* bitset of walkeepers requesting AdvancePollState */ /* Declarations of a few functions ahead of time, so that we can define them out of order. */ static void AdvancePollState(int i, uint32 events); -static bool ReadPGAsyncIntoValue(int i, void* value, size_t value_size); +static bool AsyncRead(int i, void* value, size_t value_size); +static bool BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state); +static bool AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); +static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); static void HackyRemoveWalProposerEvent(int to_remove); /* @@ -104,39 +106,20 @@ InitEventSet(void) } /* - * Updates the stored wait event for the walkeeper, given its current sockWaitState + * Updates the events we're already waiting on for the WAL keeper, setting it to + * the provided `events` * - * remove_if_nothing specifies whether to remove the event if the new waiting set is empty. In - * certain cases, we have remove_if_nothing = false because it's known that the walkeeper state will - * be updated immediately after if it's not waiting for any events. 
- * - * In general, setting remove_if_nothing = false is just an optimization; setting it to true will - * almost always be correct. Please leave a comment arguing for the validity of this optimization if - * you use it. + * This function is called any time the WAL keeper's state switches to one where + * it has to wait to continue. This includes the full body of AdvancePollState + * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. */ static void -UpdateEventSet(int i, bool remove_if_nothing) +UpdateEventSet(WalKeeper* wk, uint32 events) { - uint32 events; - WalKeeper* wk = &walkeeper[i]; - - /* - * If there isn't an applicable way to update the event, we just don't bother. This function is - * sometimes called when the walkeeper isn't waiting for anything, and so the best thing to do - * is just nothing. - */ - if (wk->sockWaitState != WANTS_NO_WAIT) - { - events = WaitKindAsEvents(wk->sockWaitState); + /* eventPos = -1 when we don't have an event */ + Assert(wk->eventPos != -1); - /* If we don't already have an event, add one! */ - if (wk->eventPos == -1) - wk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(wk->conn), NULL, wk); - else - ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); - } - else if (remove_if_nothing && wk->eventPos != 1) - HackyRemoveWalProposerEvent(i); + ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); } /* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. @@ -155,54 +138,43 @@ HackyRemoveWalProposerEvent(int to_remove) InitEventSet(); /* loop through the existing walkeepers. If they aren't the one we're removing, and if they have - * a socket we can use, re-add the applicable events. - * - * We're expecting that there's no other walkeepers with `.sockWaitState = WANTS_NO_WAIT`, - * because any state without waiting should should have been handled immediately. */ + * a socket we can use, re-add the applicable events. 
*/ for (int i = 0; i < n_walkeepers; i++) { - walkeeper[i].eventPos = -1; + uint32 desired_events = WL_NO_EVENTS; + WalKeeper* wk = &walkeeper[i]; + + wk->eventPos = -1; if (i == to_remove) continue; - if (walkeeper[i].conn) + /* If this WAL keeper isn't offline, add an event for it! */ + if ((desired_events = WalKeeperStateDesiredEvents(wk->state))) { - UpdateEventSet(i, false); - - if (walkeeper[i].sockWaitState == WANTS_NO_WAIT) - { - elog(FATAL, "Unexpected walkeeper %s:%s in %s state waiting for nothing", - walkeeper[i].host, walkeeper[i].port, FormatWalKeeperState(walkeeper[i].state)); - } - else - { - UpdateEventSet(i, false); /* Will either add an event or do nothing */ - } + wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); } } } /* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ static void -ShutdownConnection(int i, bool remove_event) +ShutdownConnection(int i) { if (walkeeper[i].conn) walprop_finish(walkeeper[i].conn); walkeeper[i].conn = NULL; walkeeper[i].state = SS_OFFLINE; - walkeeper[i].pollState = SPOLL_NONE; - walkeeper[i].sockWaitState = WANTS_NO_WAIT; walkeeper[i].currMsg = NULL; - if (remove_event) - HackyRemoveWalProposerEvent(i); + HackyRemoveWalProposerEvent(i); } /* - * This function is called to establish new connection or to reestablish connection in case - * of connection failure. - * Close current connection if any and try to initiate new one + * This function is called to establish new connection or to reestablish + * connection in case of connection failure. + * + * On success, sets the state to SS_CONNECTING_WRITE. 
*/ static void ResetConnection(int i) @@ -214,7 +186,7 @@ ResetConnection(int i) { elog(WARNING, "Connection with node %s:%s in %s state failed", wk->host, wk->port, FormatWalKeeperState(wk->state)); - ShutdownConnection(i, true); + ShutdownConnection(i); } /* Try to establish new connection @@ -234,9 +206,6 @@ ResetConnection(int i) if (!wk->conn) elog(FATAL, "failed to allocate new PGconn object"); - /* The connection should always be non-blocking. It's easiest to just set that here. */ - walprop_set_nonblocking(wk->conn, true); - /* PQconnectStart won't actually start connecting until we run PQconnectPoll. Before we do that * though, we need to check that it didn't immediately fail. */ if (walprop_status(wk->conn) == WP_CONNECTION_BAD) @@ -267,9 +236,7 @@ ResetConnection(int i) */ elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); - wk->state = SS_CONNECTING; - wk->pollState = SPOLL_CONNECT; - wk->sockWaitState = WANTS_SOCK_WRITE; + wk->state = SS_CONNECTING_WRITE; sock = walprop_socket(wk->conn); wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, wk); @@ -467,6 +434,9 @@ WalProposerStartStreaming(XLogRecPtr startpos) /* * Send message to the particular node + * + * Always updates the state and event set for the WAL keeper; setting either of + * these before calling would be redundant work. */ static void SendMessageToNode(int i, WalMessage* msg) @@ -492,19 +462,15 @@ SendMessageToNode(int i, WalMessage* msg) wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); /* Once we've selected and set up our message, actually start sending it. 
*/ - wk->state = SS_SEND_WAL; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; + wk->state = SS_SEND_WAL; /* Don't ned to update the event set; that's done by AdvancePollState */ AdvancePollState(i, WL_NO_EVENTS); } else { - wk->state = SS_IDLE; - wk->pollState = SPOLL_IDLE; - wk->sockWaitState = WANTS_SOCK_READ; - UpdateEventSet(i, true); + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); } } @@ -761,45 +727,6 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec return true; } -/* Requests the currently-running WalProposerPoll to advance the state of this walkeeper */ -static void -RequestStateAdvanceNoPoll(int i) -{ - /* We only have to change the value here; it'll be detected in a call to - * AdvancePollForAllRequested when that's made. */ - request_poll_immediate |= (1 << i); -} - -static void -AdvancePollForAllRequested(void) -{ - uint32 poll_set = request_poll_immediate; - - /* - * We have this in a loop because -- in theory -- polling the requested states could produce - * more that are ready to be polled, though this *really* shouldn't occur in practice. - */ - while ((poll_set = request_poll_immediate)) - { - /* "Take responsibility" for the poll set. We don't want any possibility of other calls to - * AdvancePollForAllRequested duplicating an AdvancePollState. */ - request_poll_immediate = 0; - - /* - * Loop through all nonzero bits and call AdvancePollState - * - * FIXME: This can probably be much more efficient, using something like __builtin__clz. - * Maybe it doesn't matter though. 
- */ - for (int i = 0; i < n_walkeepers; i++) - { - /* If the ith bit is set, that state requested advancement */ - if (poll_set & (1 << i)) - AdvancePollState(i, WL_NO_EVENTS); - } - } -} - /* * Advance the WAL proposer state machine, waiting each time for events to occur */ @@ -819,22 +746,12 @@ WalProposerPoll(void) wk = (WalKeeper*) event.user_data; i = (int)(wk - walkeeper); - if (rc != 0) - { - /* - * If the event contains something that one of our walkeeper states - * was waiting for, we'll advance its state. - */ - if (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) - AdvancePollState(i, event.events); - - /* - * It's possible for AdvancePollState to result in extra states - * being ready to immediately advance to the next state (with - * pollState = SPOLL_NONE). We deal with that here. - */ - AdvancePollForAllRequested(); - } + /* + * If the event contains something that one of our walkeeper states + * was waiting for, we'll advance its state. + */ + if (rc != 0 && (event.events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE))) + AdvancePollState(i, event.events); /* If the timeout expired, attempt to reconnect to any walkeepers that we dropped */ ReconnectWalKeepers(); @@ -852,69 +769,69 @@ WalProposerPoll(void) } } -/* Performs the logic for advancing the state machine of the 'i'th walkeeper, given that a certain - * set of events has occured. */ +/* Performs the logic for advancing the state machine of the 'i'th walkeeper, + * given that a certain set of events has occured. */ static void AdvancePollState(int i, uint32 events) { WalKeeper* wk = &walkeeper[i]; - /* Continue polling all the while we don't need to wait. 
- * - * At the bottom of this function is "while (walkeeper[i].sockWaitState == WANTS_NO_WAIT)" */ - do { - uint32 expected_events = WaitKindAsEvents(wk->sockWaitState); + /* Keep advancing the state while either: + * (a) the event is still unprocessed (usually because it's the first + * iteration of the loop), or + * (b) the state can execute, and does not need to wait for any socket + * events + */ + while (events || StateShouldImmediatelyExecute(wk->state)) + { + /* Sanity check. We assume further down that the operations don't block + * because the socket is ready. */ + AssertEventsOkForState(events, wk); - /* If we were expecting SOME event but nothing happened, panic. */ - if ((expected_events & events) == 0 && expected_events) + /* Execute the code corresponding to the current state */ + switch (wk->state) { - elog(FATAL, - "unexpected event for WalKeeper poll. Expected %s, found code %s (see: FormatEvents).", - FormatWKSockWaitKind(wk->sockWaitState), FormatEvents(events)); - } + /* WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", + wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ - /* Now that we've checked the event is ok, we'll actually run the thing we're looking for */ - switch (wk->pollState) - { - /* If the polling corresponds to a "full" operation, we'll skip straight to that - we - * don't actually need to poll here. */ - case SPOLL_NONE: - case SPOLL_RETRY: - /* Equivalent to 'break', but more descriptive. */ - goto ExecuteNextProtocolState; - - /* On idle polling states, we wait for the socket to open for reading. If this happens, - * the connection has closed *normally*, so we're just done. 
*/ - case SPOLL_IDLE: - elog(LOG, "Walkeeper %s:%s closed connection from %s state", - wk->host, wk->port, FormatWalKeeperState(wk->state)); - /* 'true' to remove existing event for this walkeeper */ - ShutdownConnection(i, true); - return; - - /* Call PQconnectPoll to finalize the connection */ - case SPOLL_CONNECT: + /* Both connecting states run the same logic. The only difference is + * the events they're expecting */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: { WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - pgsocket new_sock = walprop_socket(wk->conn); + + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; switch (result) { case WP_CONN_POLLING_OK: elog(LOG, "Connected with node %s:%s", wk->host, wk->port); - /* If we're fully connected, we're good! We can move on to the next state */ + /* Once we're fully connected, we can move to the next state */ wk->state = SS_EXEC_STARTWALPUSH; - /* Update the socket -- it might have changed */ - HackyRemoveWalProposerEvent(i); - - /* We need to just pick an event to wait on; this will be overriden - * anyways later. */ - wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, new_sock, NULL, wk); + /* Even though SS_EXEC_STARTWALPUSH doesn't wait on anything, + * we do need to replace the current event, so we have to + * just pick something. We'll eventually need the socket to + * be readable, so we go with that. 
*/ + new_events = WL_SOCKET_READABLE; + break; - /* We're done, but some of the other result cases have cleanup left to do */ - goto ExecuteNextProtocolState; + /* If we need to poll to finish connecting, continue doing that */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; case WP_CONN_POLLING_FAILED: elog(WARNING, "Failed to connect to node '%s:%s': %s", @@ -922,154 +839,21 @@ AdvancePollState(int i, uint32 events) /* If connecting failed, we don't want to restart the connection because * that might run us into a loop. Instead, shut it down -- it'll naturally * restart at a slower interval on calls to ReconnectWalKeepers. */ - ShutdownConnection(i, true); + ShutdownConnection(i); return; - - case WP_CONN_POLLING_READING: - wk->sockWaitState = WANTS_SOCK_READ; - break; - - case WP_CONN_POLLING_WRITING: - wk->sockWaitState = WANTS_SOCK_WRITE; - break; } - /* If we got here, we either have to wait for reading or - * writing. The value of walkeeper[i].sockWaitState indicates - * which one of these it is. - * - * We also have to update the socket here, even if the file - * descriptor itself hasn't changed. It's possible for libpq to - * close the socket and then open a new one, reusing the same - * file descriptor. If this happens, epoll will have - * automatically removed the socket, so we'll stop receiving - * events for it unless we re-add the socket. - * - * To update the socket, we the event and add a new one back. - */ + /* Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on the new + * socket. */ HackyRemoveWalProposerEvent(i); - - wk->eventPos = AddWaitEventToSet(waitEvents, WaitKindAsEvents(wk->sockWaitState), new_sock, NULL, wk); - - /* We still have polling to do, so we can't move on to the next state. 
*/ - return; - } - - case SPOLL_WRITE_PQ_FLUSH: - { - int flush_result; - - /* If the socket is ready for reading, we have to call PQconsumeInput before - * attempting to flush. */ - if (events & WL_SOCKET_READABLE) - { - /* PQconsumeInput returns 1 if ok, 0 if there was an error */ - if (!walprop_consume_input(wk->conn)) - { - elog(WARNING, "Failed to pre-flush read input for node %s:%s in state [%s]: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } - } - - /* PQflush returns: - * 0 if uccessful, - * 1 if unable to send everything yet, - * -1 if it failed */ - switch (flush_result = walprop_flush(wk->conn)) - { - case 0: - /* On success, go to the next state. Our current state only indicates the - * state that *started* the writing, so we need to use that to figure out - * what to do next. */ - switch (wk->state) - { - case SS_EXEC_STARTWALPUSH: - wk->state = SS_WAIT_EXEC_RESULT; - break; - case SS_HANDSHAKE_SEND: - wk->state = SS_HANDSHAKE_RECV; - break; - case SS_SEND_VOTE: - wk->state = SS_WAIT_VERDICT; - break; - case SS_SEND_WAL: - wk->state = SS_RECV_FEEDBACK; - break; - default: - elog(FATAL, "Unexpected writing state [%s] for node %s:%s", - FormatWalKeeperState(wk->state), wk->host, wk->port); - } - - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case 1: - /* Nothing more to do - we'll just have to wait until we can flush again */ - return; - case -1: - elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(i); - break; - default: - elog(FATAL, "invalid return %d from PQflush", flush_result); - } + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); break; } - case SPOLL_PQ_CONSUME_AND_RETRY: - /* PQconsumeInput returns 1 on success (though maybe nothing was read), and 0 on - * 
failure. */ - if (walprop_consume_input(wk->conn)) - /* On success, retry the operation */ - goto ExecuteNextProtocolState; - else - { - /* On failure, print the failure and move on */ - elog(WARNING, "Failed to read input for node %s:%s in state %s: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } - } - -ExecuteNextProtocolState: - /* If we get here, walkeeper[i].pollState now corresponds to either SPOLL_NONE or - * SPOLL_RETRY. In either case, we should execute the operation described by the high-level - * state. - * - * All of the cases in this switch statement are provided in the order that state - * transitions happen, moving downwards. So `SS_CONNECTING` moves into - * `SS_EXEC_STARTWALPUSH`, `SS_EXEC_STARTWALPUSH` moves into `SS_WAIT_EXEC_RESULT`, etc. - * - * If/when new states are added, they should abide by the same formatting. - * - * More information about the high-level flow between states is available in the comments - * for WalKeeperState. */ - switch (wk->state) - { - /* walkeepers aren't taken out of SS_OFFLINE by polling. */ - case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ - - /* Connecting is handled by the SPOLL_CONNECT, which then puts us into - * SS_EXEC_STARTWALPUSH. There's no singular state advancement to be made here. */ - case SS_CONNECTING: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is connecting", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ - - /* Send "START_WAL_PUSH" command to the walkeeper. After sending, wait for response with - * SS_WAIT_EXEC_RESULT */ + /* Send "START_WAL_PUSH" command to the walkeeper. 
After sending, + * wait for response with SS_WAIT_EXEC_RESULT */ case SS_EXEC_STARTWALPUSH: - { - int flush_result; - if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", @@ -1078,53 +862,25 @@ AdvancePollState(int i, uint32 events) return; } - /* The query has been started (put into buffers), but hasn't been flushed yet. We - * should do that now. If there's more flushing required, keep doing that until it's - * done */ - switch ((flush_result = walprop_flush(wk->conn))) - { - case 0: - /* success -- go to the next state */ - wk->state = SS_WAIT_EXEC_RESULT; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case 1: - /* we'll have to flush again */ - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - case -1: - elog(WARNING, "Failed to flush write to node %s:%s to exec command: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); - return; - default: - elog(FATAL, "invalid return %d from PQflush", flush_result); - } - - /* If no waiting is required, we'll get to that shortly */ - UpdateEventSet(i, false); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); break; - } - /* Waiting for the result of the "START_WAL_PUSH" command. If successful, proceed to - * SS_HANDSHAKE_SEND. If needs more, wait until we can read and retry. 
*/ case SS_WAIT_EXEC_RESULT: - /* Call our wrapper around PQisBusy + PQgetResult to inspect the result */ switch (walprop_get_query_result(wk->conn)) { /* Successful result, move on to starting the handshake */ case WP_EXEC_SUCCESS_COPYBOTH: - wk->state = SS_HANDSHAKE_SEND; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; + /* Because this state is immediately executable, we'll + * start this on the next iteration of the loop */ + wk->state = SS_HANDSHAKE_SEND; break; - /* We need more calls to PQconsumeInput to completely receive this result */ + /* Needs repeated calls to finish. Wait until the socket is + * readable */ case WP_EXEC_NEEDS_INPUT: - wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY; - wk->sockWaitState = WANTS_SOCK_READ; + /* SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set */ break; case WP_EXEC_FAILED: @@ -1139,65 +895,34 @@ AdvancePollState(int i, uint32 events) elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", wk->host, wk->port); ResetConnection(i); - break; + return; } - - /* If the wait state is empty, don't remove the event -- we have more work to do */ - UpdateEventSet(i, false); - break; - /* Start handshake: first of all send information about server */ + /* Start handshake: first of all send information about the WAL + * keeper. After sending, we wait on SS_HANDSHAKE_RECV for a + * response to finish the handshake. */ case SS_HANDSHAKE_SEND: - /* Note: This state corresponds to the process of sending the relevant information - * along. The moment we finish sending, we use SS_HANDSHAKE_RECV to complete the - * handshake. */ - switch (walprop_async_write(wk->conn, &proposerGreeting, sizeof(proposerGreeting))) - { - case PG_ASYNC_WRITE_SUCCESS: - /* If the write immediately succeeds, we can move on to the next state. 
*/ - wk->state = SS_HANDSHAKE_RECV; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - - case PG_ASYNC_WRITE_WOULDBLOCK: - /* Wait until the socket is write-ready and try again */ - wk->pollState = SPOLL_RETRY; - wk->sockWaitState = WANTS_SOCK_WRITE; - break; - - case PG_ASYNC_WRITE_TRY_FLUSH: - /* We need to call PQflush some number of additional times, with different - * actions depending on whether the socket is readable or writable */ - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - - case PG_ASYNC_WRITE_FAIL: - /* On failure, print the error and reset the connection */ - elog(WARNING, "Handshake with node %s:%s failed to start: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } + /* On failure, logging & resetting the connection is handled. We + * just need to handle the control flow. */ + if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) + return; - /* Update the event set for this walkeeper, depending on what it's been changed to - * - * We set remove_if_nothing = false because we'll immediately execute - * SS_HANDSHAKE_RECV on the next iteration of the outer loop. */ - UpdateEventSet(i, false); break; - /* Finish handshake comms: receive information about the walkeeper */ + /* Finish handshake comms: receive information about the WAL keeper */ case SS_HANDSHAKE_RECV: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->greet, sizeof(wk->greet))) + if (!AsyncRead(i, &wk->greet, sizeof(wk->greet))) return; - wk->state = SS_VOTING; - wk->pollState = SPOLL_IDLE; + /* Protocol is all good, move to voting. */ + wk->state = SS_VOTING; + /* Don't need to update the event set yet. 
Either we update the + * event set to WL_SOCKET_READABLE *or* we change the state to + * SS_SEND_VOTE in the loop below */ + UpdateEventSet(wk, WL_SOCKET_READABLE); wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; @@ -1211,7 +936,13 @@ AdvancePollState(int i, uint32 events) * We'll eventually get a task when the election starts. * * If we do have quorum, we can start an election */ - if (++n_connected >= quorum) + if (++n_connected < quorum) + { + /* SS_VOTING is an idle state; read-ready indicates the + * connection closed. */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + } + else { if (n_connected == quorum) { @@ -1233,13 +964,8 @@ AdvancePollState(int i, uint32 events) if (walkeeper[j].state == SS_VOTING) { walkeeper[j].state = SS_SEND_VOTE; - walkeeper[j].pollState = SPOLL_NONE; - walkeeper[j].sockWaitState = WANTS_NO_WAIT; - - /* If this isn't the current walkeeper, defer handling this state until - * later. We'll mark it for individual work in WalProposerPoll. */ - if (j != i) - RequestStateAdvanceNoPoll(j); + /* Immediately send info */ + AdvancePollState(j, WL_NO_EVENTS); } } } @@ -1249,53 +975,26 @@ AdvancePollState(int i, uint32 events) * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to * SS_SEND_VOTE. */ case SS_VOTING: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", wk->host, wk->port); + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", + wk->host, wk->port); break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: - switch (walprop_async_write(wk->conn, &voteRequest, sizeof(voteRequest))) - { - case PG_ASYNC_WRITE_SUCCESS: - /* If the write immediately succeeds, we can move on to the next state. 
*/ - wk->state = SS_WAIT_VERDICT; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case PG_ASYNC_WRITE_WOULDBLOCK: - /* Wait until the socket is write-ready and try again */ - wk->pollState = SPOLL_RETRY; - wk->sockWaitState = WANTS_SOCK_WRITE; - break; - case PG_ASYNC_WRITE_TRY_FLUSH: - /* We need to call PQflush some number of additional times, with different - * actions depending on whether the socket is readable or writable */ - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - case PG_ASYNC_WRITE_FAIL: - /* Report the failure and reset the connection; there isn't much - * more we can do. */ - elog(WARNING, "Failed to send vote request to node %s:%s: %s", - wk->host, wk->port, - walprop_error_message(wk->conn)); - ResetConnection(i); - return; - } + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; - /* Don't remove from the event set if there's nothing we're waiting for; we'll get - * it on the next iteration of the loop */ - UpdateEventSet(i, false); + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ break; /* Start reading the walkeeper response for our candidate */ case SS_WAIT_VERDICT: /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->voteResponse, sizeof(wk->voteResponse))) + if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; - /* * In case of acceptor rejecting our vote, bail out, but only if * either it already lives in strictly higher term (concurrent @@ -1312,12 +1011,17 @@ AdvancePollState(int i, uint32 events) Assert(wk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? 
*/ - wk->state = SS_IDLE; - wk->pollState = SPOLL_IDLE; - wk->sockWaitState = WANTS_NO_WAIT; - if (++n_votes == quorum) + if (++n_votes != quorum) { + /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + SendMessageToNode(i, msgQueueHead); + } + else + { + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for read-ready */ + DetermineEpochStartLsn(); /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ @@ -1332,56 +1036,46 @@ AdvancePollState(int i, uint32 events) WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ } - else - { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ - SendMessageToNode(i, msgQueueHead); - } break; - /* Start to send the message at wk->currMsg. Triggered only by calls to + /* Idle state for sending WAL. Moved out only by calls to * SendMessageToNode */ + case SS_IDLE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); + break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + + /* Start to send the message at wk->currMsg. 
Triggered only by calls + * to SendMessageToNode */ case SS_SEND_WAL: { WalMessage* msg = wk->currMsg; - /* Don't repeat logs if we have to retry the actual send operation itself */ - if (wk->pollState != SPOLL_RETRY) - { - elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", - msg->size - sizeof(AppendRequestHeader), - LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(truncateLsn), - wk->host, wk->port); - } + elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", + msg->size - sizeof(AppendRequestHeader), + LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(truncateLsn), + wk->host, wk->port); - switch (walprop_async_write(wk->conn, &msg->req, msg->size)) - { - case PG_ASYNC_WRITE_SUCCESS: - wk->state = SS_RECV_FEEDBACK; - wk->pollState = SPOLL_NONE; - wk->sockWaitState = WANTS_NO_WAIT; - break; - case PG_ASYNC_WRITE_WOULDBLOCK: - wk->pollState = SPOLL_RETRY; - wk->sockWaitState = WANTS_SOCK_WRITE; - break; - case PG_ASYNC_WRITE_TRY_FLUSH: - wk->pollState = SPOLL_WRITE_PQ_FLUSH; - wk->sockWaitState = WANTS_SOCK_EITHER; - break; - case PG_ASYNC_WRITE_FAIL: - elog(WARNING, "Failed to send WAL to node %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - } + /* We write with msg->size here because the body of the message + * is stored after the end of the WalMessage struct, in the + * allocation for each msg */ + if (!AsyncWrite(i, &msg->req, msg->size, SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + return; - /* Don't remove if if sockWaitState == WANTS_NO_WAIT, because we'll immediately move - * on to SS_RECV_FEEDBACK if that's the case. */ - UpdateEventSet(i, false); break; } + /* Flush the WAL message we're sending from SS_SEND_WAL */ + case SS_SEND_WAL_FLUSH: + /* AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll wait + * until the next poll comes along. 
*/ + if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0, SS_RECV_FEEDBACK)) + return; + + break; + /* Start to receive the feedback from a message sent via SS_SEND_WAL */ case SS_RECV_FEEDBACK: { @@ -1391,21 +1085,16 @@ AdvancePollState(int i, uint32 events) /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ - if (!ReadPGAsyncIntoValue(i, &wk->feedback, sizeof(wk->feedback))) + if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) return; next = wk->currMsg->next; Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ - wk->state = SS_IDLE; - wk->pollState = SPOLL_IDLE; - wk->sockWaitState = WANTS_NO_WAIT; - /* Don't update the event set; that's handled by SendMessageToNode if necessary */ - wk->currMsg = NULL; HandleWalKeeperResponse(); - SendMessageToNode(i, next); + SendMessageToNode(i, next); /* Updates state & event set */ /* * Also send the new commit lsn to all the walkeepers. @@ -1424,30 +1113,29 @@ AdvancePollState(int i, uint32 events) } break; } - - /* Truly an idle state - there isn't any typ of advancement expected here. */ - case SS_IDLE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); - break; /* actually unreachable; makes the compiler happier */ } - /* On subsequent iterations of the loop, there's no additonal events to process */ + /* We've already done something for these events - don't attempt more + * states than we need to. */ events = WL_NO_EVENTS; - } while (walkeeper[i].sockWaitState == WANTS_NO_WAIT && walkeeper[i].pollState != SPOLL_IDLE); + } } /* - * Reads a CopyData block into a value, returning whether the read was successful + * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, + * returning whether the read was successful. 
* - * If the read was not immediately successful (either polling is required, or it actually failed), - * then the state is set appropriately on the walkeeper. + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. */ -bool -ReadPGAsyncIntoValue(int i, void* value, size_t value_size) +static bool +AsyncRead(int i, void* value, size_t value_size) { WalKeeper* wk = &walkeeper[i]; char *buf = NULL; int buf_size = -1; + uint32 events; switch (walprop_async_read(wk->conn, &buf, &buf_size)) { @@ -1455,14 +1143,10 @@ ReadPGAsyncIntoValue(int i, void* value, size_t value_size) case PG_ASYNC_READ_SUCCESS: break; - case PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN: - wk->pollState = SPOLL_PQ_CONSUME_AND_RETRY; - - if (wk->sockWaitState != WANTS_SOCK_READ) - { - wk->sockWaitState = WANTS_SOCK_READ; - UpdateEventSet(i, true); - } + /* If we need more input, wait until the socket is read-ready and try + * again. */ + case PG_ASYNC_READ_TRY_AGAIN: + UpdateEventSet(wk, WL_SOCKET_READABLE); return false; case PG_ASYNC_READ_FAIL: @@ -1477,7 +1161,7 @@ ReadPGAsyncIntoValue(int i, void* value, size_t value_size) /* * If we get here, the read was ok, but we still need to check it was the right amount */ - if (buf_size != value_size) + if ((size_t) buf_size != value_size) { elog(FATAL, "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", @@ -1488,6 +1172,131 @@ ReadPGAsyncIntoValue(int i, void* value, size_t value_size) /* Copy the resulting info into place */ memcpy(value, buf, buf_size); + + /* Update the events for the WalKeeper, if it's going to wait */ + events = WalKeeperStateDesiredEvents(wk->state); + if (events) + UpdateEventSet(wk, events); + + return true; +} + +/* + * Blocking equivalent to AsyncWrite. 
+ * + * We use this everywhere messages are small enough that they should fit in a + * single packet. + */ +static bool +BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) +{ + WalKeeper* wk = &walkeeper[i]; + uint32 events; + + if (!walprop_blocking_write(wk->conn, msg, msg_size)) + { + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return false; + } + + wk->state = success_state; + + /* If the new state will be waiting for events to happen, update the event + * set to wait for those */ + events = WalKeeperStateDesiredEvents(success_state); + if (events) + UpdateEventSet(wk, events); + + return true; +} + +/* + * Starts a write into the 'i'th WAL keeper's postgres connection, moving to + * success_state only when the write succeeds. If the write needs flushing, + * moves to flush_state. + * + * Returns false only if the write immediately fails. Upon failure, a warning is + * emitted and the connection is reset. + */ +static bool +AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) +{ + WalKeeper* wk = &walkeeper[i]; + uint32 events; + + switch (walprop_async_write(wk->conn, msg, msg_size)) + { + case PG_ASYNC_WRITE_SUCCESS: + wk->state = success_state; + break; + case PG_ASYNC_WRITE_TRY_FLUSH: + /* We still need to call PQflush some more to finish the job; go to + * the appropriate state. 
Update the event set at the bottom of this + * function */ + wk->state = flush_state; + break; + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return false; + } + + /* If the new state will be waiting for something, update the event set */ + events = WalKeeperStateDesiredEvents(wk->state); + if (events) + UpdateEventSet(wk, events); + + return true; +} + +/* + * Flushes a previous call to AsyncWrite. This only needs to be called when the + * socket becomes read or write ready *after* calling AsyncWrite. + * + * If flushing completes, moves to 'success_state' and returns true. If more + * flushes are needed, does nothing and returns true. + * + * On failure, emits a warning, resets the connection, and returns false. + */ +static bool +AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) +{ + WalKeeper* wk = &walkeeper[i]; + uint32 events; + + /* PQflush returns: + * 0 if successful [we're good to move on] + * 1 if unable to send everything yet [call PQflush again] + * -1 if it failed [emit an error] + */ + switch (walprop_flush(wk->conn, socket_read_ready)) + { + case 0: + /* On success, move to the next state - that logic is further down */ + break; + case 1: + /* Nothing to do; try again when the socket's ready */ + return true; + case -1: + elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ResetConnection(i); + return false; + } + + wk->state = success_state; + + /* If the new state will be waiting for something, update the event set */ + events = WalKeeperStateDesiredEvents(wk->state); + if (events) + UpdateEventSet(wk, events); + return true; } diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 29c209e63c1..16d84ac7f17 
100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -21,36 +21,6 @@ CompareLsn(const void *a, const void *b) return 1; } -/* Converts a `WKSockWaitKind` into the bit flags that would match it - * - * Note: For `wait_kind = WANTS_NO_WAIT`, this will return a value of zero, - * which does not match any events. Attempting to wait on no events will - * always timeout, so it's best to double-check the value being provided to - * this function where necessary. */ -uint32 -WaitKindAsEvents(WKSockWaitKind wait_kind) -{ - uint32 return_val; - - switch (wait_kind) - { - case WANTS_NO_WAIT: - return_val = WL_NO_EVENTS; - break; - case WANTS_SOCK_READ: - return_val = WL_SOCKET_READABLE; - break; - case WANTS_SOCK_WRITE: - return_val = WL_SOCKET_WRITEABLE; - break; - case WANTS_SOCK_EITHER: - return_val = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - } - - return return_val; -} - /* Returns a human-readable string corresonding to the WalKeeperState * * The string should not be freed. 
@@ -66,14 +36,15 @@ WaitKindAsEvents(WKSockWaitKind wait_kind) char* FormatWalKeeperState(WalKeeperState state) { - char* return_val; + char* return_val = NULL; switch (state) { case SS_OFFLINE: return_val = "offline"; break; - case SS_CONNECTING: + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: return_val = "connecting"; break; case SS_EXEC_STARTWALPUSH: @@ -103,39 +74,111 @@ FormatWalKeeperState(WalKeeperState state) case SS_SEND_WAL: return_val = "WAL-sending"; break; + case SS_SEND_WAL_FLUSH: + return_val = "WAL-sending (flushing)"; + break; case SS_RECV_FEEDBACK: return_val = "WAL-feedback-receiving"; break; } + Assert(return_val != NULL); + return return_val; } -/* Returns a human-readable string corresponding to the WKSockWaitKind +/* Asserts that the provided events are expected for given WAL keeper's state */ +void +AssertEventsOkForState(uint32 events, WalKeeper* wk) +{ + uint32 expected = WalKeeperStateDesiredEvents(wk->state); + + /* The events are in-line with what we're expecting, under two conditions: + * (a) if we aren't expecting anything, `events` has no read- or + * write-ready component. + * (b) if we are expecting something, there's overlap + * (i.e. `events & expected != 0`) + */ + bool events_ok_for_state; /* long name so the `Assert` is more clear later */ + + if (expected == WL_NO_EVENTS) + events_ok_for_state = ((events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE)) == 0); + else + events_ok_for_state = ((events & expected) != 0); + + if (!events_ok_for_state) + { + /* To give a descriptive message in the case of failure, we use elog and + * then an assertion that's guaranteed to fail. */ + elog(WARNING, "events %s mismatched for walkeeper %s:%s in state [%s]", + FormatEvents(events), wk->host, wk->port, FormatWalKeeperState(wk->state)); + Assert(events_ok_for_state); + } +} + +/* Returns the set of events a WAL keeper in this state should be waiting on * - * The string should not be freed. 
*/ -char* -FormatWKSockWaitKind(WKSockWaitKind wait_kind) + * This will return WL_NO_EVENTS (= 0) for some events. */ +uint32 +WalKeeperStateDesiredEvents(WalKeeperState state) { - char* return_val; + uint32 result; - switch (wait_kind) + /* If the state doesn't have a modifier, we can check the base state */ + switch (state) { - case WANTS_NO_WAIT: - return_val = ""; + /* Connecting states say what they want in the name */ + case SS_CONNECTING_READ: + result = WL_SOCKET_READABLE; break; - case WANTS_SOCK_READ: - return_val = ""; + case SS_CONNECTING_WRITE: + result = WL_SOCKET_WRITEABLE; break; - case WANTS_SOCK_WRITE: - return_val = ""; + + /* Reading states need the socket to be read-ready to continue */ + case SS_WAIT_EXEC_RESULT: + case SS_HANDSHAKE_RECV: + case SS_WAIT_VERDICT: + case SS_RECV_FEEDBACK: + result = WL_SOCKET_READABLE; + break; + + /* Most writing states don't require any socket conditions */ + case SS_EXEC_STARTWALPUSH: + case SS_HANDSHAKE_SEND: + case SS_SEND_VOTE: + case SS_SEND_WAL: + result = WL_NO_EVENTS; break; - case WANTS_SOCK_EITHER: - return_val = ""; + /* but flushing does require read- or write-ready */ + case SS_SEND_WAL_FLUSH: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + + /* Idle states use read-readiness as a sign that the connection has been + * disconnected. */ + case SS_VOTING: + case SS_IDLE: + result = WL_SOCKET_READABLE; + break; + + /* The offline state expects no events. */ + case SS_OFFLINE: + result = WL_NO_EVENTS; break; } - return return_val; + return result; +} + +/* Returns whether the WAL keeper state corresponds to something that should be + * immediately executed -- i.e. it is not idle, and is not currently waiting. */ +bool +StateShouldImmediatelyExecute(WalKeeperState state) +{ + /* This is actually pretty simple to determine. 
*/ + return WalKeeperStateDesiredEvents(state) == WL_NO_EVENTS + && state != SS_OFFLINE; } /* Returns a human-readable string corresponding to the event set diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index af4d877963d..6741e9f82dd 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -22,11 +22,7 @@ * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 */ -#ifndef WL_NO_EVENTS #define WL_NO_EVENTS 0 -#else -#error "WL_NO_EVENTS already defined" -#endif extern char* wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; @@ -46,9 +42,9 @@ typedef enum { /* The full read was successful. buf now points to the data */ PG_ASYNC_READ_SUCCESS, - /* The read is ongoing. Wait until the connection is read-ready, then - * call PQconsumeInput and try again. */ - PG_ASYNC_READ_CONSUME_AND_TRY_AGAIN, + /* The read is ongoing. Wait until the connection is read-ready, then try + * again. */ + PG_ASYNC_READ_TRY_AGAIN, /* Reading failed. Check PQerrorMessage(conn) */ PG_ASYNC_READ_FAIL, } PGAsyncReadResult; @@ -58,9 +54,6 @@ typedef enum { /* The write fully completed */ PG_ASYNC_WRITE_SUCCESS, - /* There wasn't space in the buffers to queue the data; wait until the - * socket is write-ready and try again. */ - PG_ASYNC_WRITE_WOULDBLOCK, /* The write started, but you'll need to call PQflush some more times * to finish it off. We just tried, so it's best to wait until the * connection is read- or write-ready to try again. 
@@ -73,98 +66,109 @@ typedef enum PG_ASYNC_WRITE_FAIL, } PGAsyncWriteResult; -/* WAL safekeeper state - high level */ +/* + * WAL safekeeper state + * + * States are listed here in the order that they're executed - with the only + * exception occuring from the "send WAL" cycle, which loops as: + * + * SS_IDLE -> SS_SEND_WAL (+ flush) -> SS_RECV_FEEDBACK -> SS_IDLE/SS_SEND_WAL + * + * Most states, upon failure, will move back to SS_OFFLINE by calls to + * ResetConnection or ShutdownConnection. + * + * Also note: In places we say that a state "immediately" moves to another. This + * happens in states that only exist to execute program logic, so they run + * exactly once (when moved into), without waiting for any socket conditions. + * + * For example, when we set a WalKeeper's state to SS_SEND_VOTE, we immediately + * call AdvancePollState - during which the WalKeeper switches its state to + * SS_WAIT_VERDICT. + */ typedef enum { /* * Does not have an active connection and will stay that way until - * further notice. May be paired with: - * - SPOLL_NONE + * further notice. * - * Moves to SS_CONNECTING only by calls to ResetConnection. + * Moves to SS_CONNECTING_WRITE by calls to ResetConnection. */ SS_OFFLINE, + /* - * Currently in the process of connecting. May be paired with: - * - SPOLL_CONNECT + * Connecting states. "_READ" waits for the socket to be available for + * reading, "_WRITE" waits for writing. There's no difference in the code + * they execute when polled, but we have this distinction in order to + * recreate the event set in HackyRemoveWalProposerEvent. * * After the connection is made, moves to SS_EXEC_STARTWALPUSH. */ - SS_CONNECTING, + SS_CONNECTING_WRITE, + SS_CONNECTING_READ, + /* - * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. May be paired with: - * - SPOLL_NONE - * - SPOLL_WRITE_PQ_FLUSH - * - * After the query sends, moves to SS_WAIT_EXEC_RESULT. 
+ * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. + * Performs a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. */ SS_EXEC_STARTWALPUSH, /* - * Waiting for the result of the "START_WAL_PUSH" command. May be paired with: - * - SPOLL_PQ_CONSUME_AND_RETRY - * - * We only pair with PQconsumeInput because we *need* to wait until the socket is open for - * reading to try again. + * Waiting for the result of the "START_WAL_PUSH" command. * * After we get a successful result, moves to SS_HANDSHAKE_SEND. */ SS_WAIT_EXEC_RESULT, + /* - * Executing the sending half of the handshake. May be paired with: - * - SPOLL_WRITE_PQ_FLUSH if it hasn't finished sending, - * - SPOLL_RETRY if buffers are full and we just need to try again, - * - SPOLL_NONE - * - * After sending, moves to SS_HANDSHAKE_RECV. + * Executing the sending half of the handshake. Performs the blocking send, + * then immediately moves to SS_HANDSHAKE_RECV. */ SS_HANDSHAKE_SEND, /* - * Executing the receiving half of the handshake. May be paired with: - * - SPOLL_PQ_CONSUME_AND_RETRY if we need more input - * - SPOLL_NONE - * - * After receiving, moves to SS_VOTING. + * Executing the receiving half of the handshake. After receiving, moves to + * SS_VOTING. */ SS_HANDSHAKE_RECV, + /* - * Currently participating in voting, but a quorum hasn't yet been reached. Idle state. May be - * paired with: - * - SPOLL_IDLE + * Currently participating in voting, but a quorum hasn't yet been reached. + * This is an idle state - we do not expect AdvancePollState to be called. * - * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of SS_HANDSHAKE_RECV. + * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of + * SS_HANDSHAKE_RECV. */ SS_VOTING, /* - * Currently sending the assigned vote + * Performs a blocking send of the assigned vote, then immediately moves to + * SS_WAIT_VERDICT. 
*/ SS_SEND_VOTE, /* - * Sent voting information, waiting to receive confirmation from the node. May be paired with: - * - SPOLL_WRITE_PQ_FLUSH - * - * After receiving, moves to SS_IDLE. + * Already sent voting information, waiting to receive confirmation from the + * node. After receiving, moves to SS_IDLE. */ SS_WAIT_VERDICT, + /* - * Waiting for quorum to send WAL. Idle state. May be paired with: - * - SPOLL_IDLE + * Waiting for quorum to send WAL. Idle state. If the socket becomes + * read-ready, the connection has been closed. * * Moves to SS_SEND_WAL only by calls to SendMessageToNode. */ SS_IDLE, /* - * Currently sending the message at currMsg. This state is only ever reached through calls to - * SendMessageToNode. May be paired with: - * - SPOLL_WRITE_PQ_FLUSH - * - SPOLL_NONE + * Start sending the message at currMsg. This state is only ever reached + * through calls to SendMessageToNode. * - * After sending, moves to SS_RECV_FEEDBACK. + * Sending needs to flush; immediately moves to SS_SEND_WAL_FLUSH. */ SS_SEND_WAL, /* - * Currently reading feedback from sending the WAL. May be paired with: - * - SPOLL_PQ_CONSUME_AND_RETRY - * - SPOLL_NONE + * Flush the WAL message, repeated until successful. On success, moves to + * SS_RECV_FEEDBACK. + */ + SS_SEND_WAL_FLUSH, + /* + * Currently reading feedback from sending the WAL. * * After reading, moves to (SS_SEND_WAL or SS_IDLE) by calls to * SendMessageToNode. @@ -172,86 +176,6 @@ typedef enum SS_RECV_FEEDBACK, } WalKeeperState; -/* WAL safekeeper state - individual level - * - * This type encompasses the type of polling necessary to move on to the - * next `WalKeeperState` from the current. It's things like "we need to - * call PQflush some more", or "retry the current operation". - */ -typedef enum -{ - /* - * The current state is the one we want to be in; we just haven't run - * the code for it. It should be processed with AdvancePollState to - * start to advance to the next state. 
- * - * Expected WKSockWaitKind: WANTS_NO_WAIT. - * - * Note! This polling state is different from the others: its attached - * WalKeeperState is what *will* be executed, not what just was. - */ - SPOLL_NONE, - /* - * We need to retry the operation once the socket permits it - * - * Expected WKSockWaitKind: Any of WANTS_SOCK_READ, WANTS_SOCK_WRITE, - * WANTS_SOCK_EITHER -- operation dependent. - */ - SPOLL_RETRY, - /* - * Marker for states that do not expect to be advanced by calls to AdvancePollState. Not to be - * confused with SS_IDLE, which carries a different (but related) meaning. - * - * For this polling state, we interpret any read-readiness on the socket as an indication that - * the connection has closed normally. - * - * Expected WKSockWaitKind: WANTS_SOCK_READ - */ - SPOLL_IDLE, - /* - * We need to repeat calls to PQconnectPoll. This is only available for - * SS_CONNECTING - * - * Expected WKSockWaitKind: WANTS_SOCK_READ or WANTS_SOCK_WRITE - */ - SPOLL_CONNECT, - /* Poll with PQflush, finishing up a call to WritePGAsync. Always - * combined with writing states, like SS_HANDSHAKE_SEND or SS_SEND_WAL. - * - * Expected WKSockWaitKind: WANTS_SOCK_EITHER - */ - SPOLL_WRITE_PQ_FLUSH, - /* - * Get input with PQconsumeInput and try the operation again. This is - * always combined with reading states -- like SS_HANDSHAKE_RECV or - * SS_WAIT_VERDICT, and the operation repetition helps to reduce the - * amount of repeated logic. - * - * Expected WKSockWaitKind: WANTS_SOCK_READ - */ - SPOLL_PQ_CONSUME_AND_RETRY, -} WalKeeperPollState; - -/* The state of the socket that we're waiting on. This is used to - * double-check for polling that the socket we're being handed is correct. - * - * Used in the sockWaitState field of WalKeeper, in combination with the - * WalKeeperPollState. - * - * Each polling state above lists the set of values that they accept. 
*/ -typedef enum -{ - /* No waiting is required for the poll state */ - WANTS_NO_WAIT, - /* Polling should resume only once the socket is ready for reading */ - WANTS_SOCK_READ, - /* Polling should resume only once the socket is ready for writing */ - WANTS_SOCK_WRITE, - /* Polling should resume once the socket is ready for reading or - * writing */ - WANTS_SOCK_EITHER, -} WKSockWaitKind; - /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -379,15 +303,19 @@ typedef struct WalKeeper char const* host; char const* port; char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */ - WalProposerConn* conn; /* postgres protocol connection to the walreceiver */ + + /* + * postgres protocol connection to the WAL acceptor + * + * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we + * reach SS_SEND_WAL; not before. + */ + WalProposerConn* conn; WalMessage* currMsg; /* message been send to the receiver */ int eventPos; /* position in wait event set. Equal to -1 if no event */ WalKeeperState state; /* walkeeper state machine state */ - WalKeeperPollState pollState; /* what kind of polling is necessary to advance `state` */ - WKSockWaitKind sockWaitState; /* what state are we expecting the socket to be in for - the polling required? 
*/ AcceptorGreeting greet; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ AppendResponse feedback; /* feedback to master */ @@ -395,9 +323,10 @@ typedef struct WalKeeper int CompareLsn(const void *a, const void *b); -uint32 WaitKindAsEvents(WKSockWaitKind wait_kind); char* FormatWalKeeperState(WalKeeperState state); -char* FormatWKSockWaitKind(WKSockWaitKind wait_kind); +void AssertEventsOkForState(uint32 events, WalKeeper* wk); +uint32 WalKeeperStateDesiredEvents(WalKeeperState state); +bool StateShouldImmediatelyExecute(WalKeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); @@ -442,8 +371,8 @@ typedef enum * * Do not expect PQerrorMessage to be appropriately set. */ WP_EXEC_UNEXPECTED_SUCCESS, - /* No result available at this time. Wait until read-ready, call PQconsumeInput, then try again. - * Internally, this is returned when PQisBusy indicates that PQgetResult would block. */ + /* No result available at this time. Wait until read-ready, then call again. Internally, this is + * returned when PQisBusy indicates that PQgetResult would block. */ WP_EXEC_NEEDS_INPUT, /* Catch-all failure. Check PQerrorMessage. 
*/ WP_EXEC_FAILED, @@ -476,23 +405,17 @@ typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo); /* Re-exported PQconectPoll */ typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn); -/* Re-exported PQsendQuery */ +/* Blocking wrapper around PQsendQuery */ typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query); -/* Wrapper around PQisBusy + PQgetResult */ +/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */ typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn); -/* Re-exported PQsetnonblocking */ -typedef int (*walprop_set_nonblocking_fn) (WalProposerConn* conn, int arg); - /* Re-exported PQsocket */ typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); -/* Re-exported PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn); - -/* Re-exported PQconsumeInput */ -typedef int (*walprop_consume_input_fn) (WalProposerConn* conn); +/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ +typedef int (*walprop_flush_fn) (WalProposerConn* conn, bool socket_read_ready); /* Re-exported PQfinish */ typedef void (*walprop_finish_fn) (WalProposerConn* conn); @@ -507,9 +430,9 @@ typedef void (*walprop_finish_fn) (WalProposerConn* conn); * protocol with the walkeepers, so it should not be used as-is for any * other purpose. * - * Note: If possible, using is generally preferred, - * because it performs a bit of extra checking work that's always required - * and is normally somewhat verbose. + * Note: If possible, using is generally preferred, because it + * performs a bit of extra checking work that's always required and is normally + * somewhat verbose. 
*/ typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, char** buf, @@ -526,6 +449,13 @@ typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn, void const* buf, size_t size); +/* + * Blocking equivalent to walprop_async_write_fn + * + * Returns 'true' if successful, 'false' on failure. + */ +typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size); + /* All libpqwalproposer exported functions collected together. */ typedef struct WalProposerFunctionsType { @@ -535,13 +465,12 @@ typedef struct WalProposerFunctionsType walprop_connect_poll_fn walprop_connect_poll; walprop_send_query_fn walprop_send_query; walprop_get_query_result_fn walprop_get_query_result; - walprop_set_nonblocking_fn walprop_set_nonblocking; walprop_socket_fn walprop_socket; walprop_flush_fn walprop_flush; - walprop_consume_input_fn walprop_consume_input; walprop_finish_fn walprop_finish; walprop_async_read_fn walprop_async_read; walprop_async_write_fn walprop_async_write; + walprop_blocking_write_fn walprop_blocking_write; } WalProposerFunctionsType; /* Allow the above functions to be "called" with normal syntax */ @@ -561,16 +490,16 @@ typedef struct WalProposerFunctionsType WalProposerFunctions->walprop_set_nonblocking(conn, arg) #define walprop_socket(conn) \ WalProposerFunctions->walprop_socket(conn) -#define walprop_flush(conn) \ - WalProposerFunctions->walprop_flush(conn) -#define walprop_consume_input(conn) \ - WalProposerFunctions->walprop_consume_input(conn) +#define walprop_flush(conn, consume_input) \ + WalProposerFunctions->walprop_flush(conn, consume_input) #define walprop_finish(conn) \ WalProposerFunctions->walprop_finish(conn) #define walprop_async_read(conn, buf, amount) \ WalProposerFunctions->walprop_async_read(conn, buf, amount) #define walprop_async_write(conn, buf, size) \ WalProposerFunctions->walprop_async_write(conn, buf, size) +#define walprop_blocking_write(conn, buf, size) \ + 
WalProposerFunctions->walprop_blocking_write(conn, buf, size) /* * The runtime location of the libpqwalproposer functions. From 7d06d8fa11893cc179ee72173aac797b99acc5de Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 31 Aug 2021 22:15:21 +0300 Subject: [PATCH 041/167] pass tenant id in connection string to safekeeper --- src/backend/replication/walproposer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 1bbe5f30b3a..d612e33331d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -196,8 +196,8 @@ ResetConnection(int i) if (wk->conninfo[0] == '\0') { sprintf((char*) &wk->conninfo, - "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", - wk->host, wk->port, zenith_timeline_walproposer); + "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + wk->host, wk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); } wk->conn = walprop_connect_start((char*) &wk->conninfo); @@ -658,8 +658,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec WalReceiverConn *wrconn; WalRcvStreamOptions options; - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s'", - walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer); + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { From 6a45e630bf4d854d84a5ad96ae1ef57e5411ab94 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 31 Aug 2021 18:13:11 +0300 Subject: [PATCH 042/167] Ask pageserver only with LSN's aligned on record boundary. Now pageserver tracks only last_record_lsn and ignores last_valids_lsn. 
We can cause deadlock at start or extreme slowness during the normal work if we call get_page with LSN of incomplete record. Patch by @knizhnik --- contrib/zenith/pagestore_smgr.c | 24 ++++++------------------ src/backend/access/transam/xlog.c | 1 + 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 5db79710d68..569d1c330d8 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -417,7 +417,6 @@ static XLogRecPtr zenith_get_request_lsn(bool nonrel) { XLogRecPtr lsn; - XLogRecPtr flushlsn; if (RecoveryInProgress()) { @@ -434,12 +433,12 @@ zenith_get_request_lsn(bool nonrel) } else if (nonrel) { - lsn = GetFlushRecPtr(); - elog(DEBUG1, "zenith_get_request_lsn norel GetFlushRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); + lsn = GetLastImportantRecPtr(); + elog(DEBUG1, "zenith_get_request_lsn norel GetLastImportantRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); } else { - flushlsn = GetFlushRecPtr(); + XLogRecPtr flushlsn; /* * Use the latest LSN that was evicted from the buffer cache. Any @@ -447,29 +446,18 @@ zenith_get_request_lsn(bool nonrel) * so our request cannot concern those. */ lsn = GetLastWrittenPageLSN(); + Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - if (lsn == InvalidXLogRecPtr) - { - /* - * We haven't evicted anything yet since the server was - * started. Then just use the latest flushed LSN. That's always - * safe, using the latest evicted LSN is really just an - * optimization. - */ - lsn = flushlsn; - elog(DEBUG1, "zenith_get_request_lsn GetFlushRecPtr lsn %X/%X", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - } - else - lsn = zm_adjust_lsn(lsn); + lsn = zm_adjust_lsn(lsn); /* * Is it possible that the last-written LSN is ahead of last flush LSN? 
Probably not, * we shouldn't evict a page from the buffer cache before all its modifications have * been safely flushed. That's the "WAL before data" rule. But better safe than sorry. */ + flushlsn = GetFlushRecPtr(); if (lsn > flushlsn) { elog(LOG, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c0e5593eba6..23ee3edbc5d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7956,6 +7956,7 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; + XLogCtl->lastWrittenPageLSN = EndOfLog; LocalSetXLogInsertAllowed(); From ffa97f05c72b5e530aebc59003750bc758517451 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 1 Sep 2021 23:17:17 +0300 Subject: [PATCH 043/167] [refer #506] Correctly initialize all fields of WAL page header for first WAL record of started compute node --- src/backend/access/transam/xlog.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 23ee3edbc5d..beb114c16a5 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7053,6 +7053,10 @@ StartupXLOG(void) EndRecPtr = RecPtr = checkPoint.redo; skipLastRecordReread = true; close(fd); + + elog(LOG, + "[ZENITH] found 'zenith.signal' file. 
Setting prevRecPtr to %X/%X", + LSN_FORMAT_ARGS(prevRecPtr)); } else { @@ -7732,11 +7736,15 @@ StartupXLOG(void) */ if (skipLastRecordReread) { - XLogRecPtr lastPage = EndRecPtr - (EndRecPtr % XLOG_BLCKSZ); + int offs = (EndRecPtr % XLOG_BLCKSZ); + XLogRecPtr lastPage = EndRecPtr - offs; int idx = XLogRecPtrToBufIdx(lastPage); XLogPageHeader xlogPageHdr = (XLogPageHeader)(XLogCtl->pages + idx*XLOG_BLCKSZ); xlogPageHdr->xlp_pageaddr = lastPage; xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = ThisTimeLineID; + xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; + xlogPageHdr->xlp_rem_len = offs - SizeOfXLogShortPHD; readOff = XLogSegmentOffset(lastPage, wal_segment_size); elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); } From 3600ccee45681ef421012fb54a1cba58a86da734 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 19 Aug 2021 19:55:38 +0300 Subject: [PATCH 044/167] Add --sync-safekeepers starting standalone walproposer to sync safekeepers (#439). It is intended to solve the following problems: a) Chicken-or-the-egg one: compute postgres needs data directory with non-rel files that are downloaded from pageserver by calling basebackup@LSN. This LSN is not arbitrary, it must include all previously committed transactions and defined through consensus voting, which happens... in walproposer, a part of compute node. b) Just warranting such LSN is not enough, we must also actually commit it and make sure there is a safekeeper who knows this LSN is committed so WAL before it can be streamed to pageserver -- otherwise basebackup will hang waiting for WAL. Advancing commit_lsn without playing consensus game is impossible, so speculative 'let's just poll safekeepers, learn start LSN of future epoch and run basebackup' won't work. 
Currently --sync-safekeepers is considered completed when 1) at least majority of safekeepers and 2) *all* safekeepers with live connection to walproposer switch to new epoch and advance commit_lsn allowing basebackup to proceed. 2) limits availablity, but that's because currently we don't have a mechanism defining which safekeeper should stream WAL into pageserver. --- src/backend/main/main.c | 3 + src/backend/replication/walproposer.c | 302 +++++++++++++++++++++----- src/include/replication/walproposer.h | 9 +- 3 files changed, 254 insertions(+), 60 deletions(-) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 51c9dfedabc..eb98fca066f 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -35,6 +35,7 @@ #include "common/username.h" #include "port/atomics.h" #include "postmaster/postmaster.h" +#include "replication/walproposer.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" @@ -209,6 +210,8 @@ main(int argc, char *argv[]) WalRedoMain(argc, argv, NULL, /* no dbname */ strdup(get_user_name_or_exit(progname))); /* does not return */ + else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) + WalProposerSync(argc, argv); else PostmasterMain(argc, argv); /* does not return */ abort(); /* should not get here */ diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d612e33331d..06b227244f9 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -2,7 +2,36 @@ * * walproposer.c * - * Broadcast WAL stream to Zenith WAL acceptetors + * Proposer/leader part of the total order broadcast protocol between postgres + * and WAL safekeepers. + * + * We have two ways of launching WalProposer: + * + * 1. As a background worker which will run physical WalSender with + * am_wal_proposer flag set to true. WalSender in turn would handle WAL + * reading part and call WalProposer when ready to scatter WAL. + * + * 2. 
As a standalone utility by running `postgres --sync-safekeepers`. That + * is needed to create LSN from which it is safe to start postgres. More + * specifically it addresses following problems: + * + * a) Chicken-or-the-egg problem: compute postgres needs data directory + * with non-rel files that are downloaded from pageserver by calling + * basebackup@LSN. This LSN is not arbitrary, it must include all + * previously committed transactions and defined through consensus + * voting, which happens... in walproposer, a part of compute node. + * + * b) Just warranting such LSN is not enough, we must also actually commit + * it and make sure there is a safekeeper who knows this LSN is + * committed so WAL before it can be streamed to pageserver -- otherwise + * basebackup will hang waiting for WAL. Advancing commit_lsn without + * playing consensus game is impossible, so speculative 'let's just poll + * safekeepers, learn start LSN of future epoch and run basebackup' + * won't work. + * + * TODO: check that LSN on safekeepers after start is the same as it was + * after `postgres --sync-safekeepers`. + *------------------------------------------------------------------------- */ #include "postgres.h" @@ -18,6 +47,7 @@ #include "replication/walreceiver.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" #include "storage/pmsignal.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" @@ -29,7 +59,6 @@ char* wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; - /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ WalProposerFunctionsType* WalProposerFunctions = NULL; @@ -45,7 +74,11 @@ static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to walkeepers * static ProposerGreeting proposerGreeting; static WaitEventSet* waitEvents; static AppendResponse lastFeedback; -static XLogRecPtr truncateLsn; /* Last position received by all walkeepers. 
*/ +/* + * minimal LSN which may be needed for recovery of some safekeeper (end lsn + * + 1 of last chunk streamed to everyone) + */ +static XLogRecPtr truncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ @@ -55,6 +88,9 @@ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; +/* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ +static bool syncSafekeepers; + /* Declarations of a few functions ahead of time, so that we can define them out of order. */ static void AdvancePollState(int i, uint32 events); static bool AsyncRead(int i, void* value, size_t value_size); @@ -62,6 +98,9 @@ static bool BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState succ static bool AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); static void HackyRemoveWalProposerEvent(int to_remove); +static WalMessage* CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void BroadcastMessage(WalMessage* msg); + /* * Combine hot standby feedbacks from all walkeepers. 
@@ -277,23 +316,27 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; + int i; + int n_synced; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) { lastFeedback.flushLsn = minQuorumLsn; /* advance the replication slot */ - ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); + if (!syncSafekeepers) + ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) { lastFeedback.hs = hsFeedback; - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } @@ -312,26 +355,52 @@ HandleWalKeeperResponse(void) } if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; + + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are ready + * to give all WAL to pageserver. It would mean whichever majority is alive, + * there will be at least one safekeeper who is able to stream WAL to + * pageserver to make basebackup possible. 
However, since at the moment we + * don't have any good mechanism of defining the healthy and most advanced + * safekeeper who should push the wal into pageserver and basically the + * random one gets connected, to prevent hanging basebackup (due to + * pageserver connecting to not-synced-walkeeper) we currently wait for all + * seemingly alive walkeepers to get synced. + */ + if (syncSafekeepers) + { + for (int i = 0; i < n_walkeepers; i++) + { + WalKeeper *wk = &walkeeper[i]; + bool synced = wk->feedback.commitLsn >= propEpochStartLsn; + + /* alive safekeeper which is not synced yet; wait for it */ + if (wk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All walkeepers synced! */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + } } char *zenith_timeline_walproposer = NULL; char *zenith_tenant_walproposer = NULL; -/* - * WAL proposer bgworeker entry point - */ -void -WalProposerMain(Datum main_arg) + +static void +WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) { char* host; char* sep; char* port; - /* Establish signal handlers. 
*/ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - /* Load the libpq-specific functions */ load_file("libpqwalproposer", false); if (WalProposerFunctions == NULL) @@ -340,11 +409,8 @@ WalProposerMain(Datum main_arg) load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("zenith", false); - BackgroundWorkerUnblockSignals(); - for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); @@ -374,14 +440,12 @@ WalProposerMain(Datum main_arg) } quorum = n_walkeepers/2 + 1; - GetXLogReplayRecPtr(&ThisTimeLineID); - /* Fill the greeting package */ proposerGreeting.tag = 'g'; proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; proposerGreeting.pgVersion = PG_VERSION_NUM; pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); - proposerGreeting.systemId = GetSystemIdentifier(); + proposerGreeting.systemId = systemId; if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && @@ -395,13 +459,52 @@ WalProposerMain(Datum main_arg) proposerGreeting.timeline = ThisTimeLineID; proposerGreeting.walSegSize = wal_segment_size; + InitEventSet(); +} + +static void +WalProposerLoop(void) +{ + while (true) + WalProposerPoll(); +} + +static void +WalProposerStart(void) +{ + + /* Initiate connections to all walkeeper nodes */ + for (int i = 0; i < n_walkeepers; i++) + { + ResetConnection(i); + } + + WalProposerLoop(); +} + +/* + * WAL proposer bgworeker entry point + */ +void +WalProposerMain(Datum main_arg) +{ + /* Establish signal handlers. 
*/ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + GetXLogReplayRecPtr(&ThisTimeLineID); + + WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); + last_reconnect_attempt = GetCurrentTimestamp(); application_name = (char *) "walproposer"; /* for synchronous_standby_names */ am_wal_proposer = true; am_walsender = true; InitWalSender(); - InitEventSet(); /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) @@ -410,14 +513,54 @@ WalProposerMain(Datum main_arg) ReplicationSlotRelease(); } - /* Initiate connections to all walkeeper nodes */ - for (int i = 0; i < n_walkeepers; i++) - { - ResetConnection(i); - } + WalProposerStart(); +} - while (true) - WalProposerPoll(); +void +WalProposerSync(int argc, char *argv[]) +{ + syncSafekeepers = true; + + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * zenith extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. 
+ * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + WalProposerInit(0, 0); + + process_shared_preload_libraries_in_progress = false; + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); } static void @@ -458,8 +601,22 @@ SendMessageToNode(int i, WalMessage* msg) /* Only try to send the message if it's non-null */ if (wk->currMsg) { - wk->currMsg->req.truncateLsn = truncateLsn; wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); + /* + * truncateLsn is advanced immediately once chunk is broadcast to all + * safekeepers, and commitLsn generally can't be advanced based on + * feedback from safekeeper who is still in the previous epoch (similar + * to 'leader can't commit entries from previous term' in Raft), so the + * first might surprisingly get higher than the latter. + * + * Another reason for this will be switch to proper acks from + * safekeepers: they must point to end of last valid record, not just + * end of last received chunk. + * + * Free safekeeper from such surprises by holding back truncateLsn in + * these cases. + */ + wk->currMsg->req.truncateLsn = Min(truncateLsn, wk->currMsg->req.commitLsn); /* Once we've selected and set up our message, actually start sending it. */ wk->state = SS_SEND_WAL; @@ -539,17 +696,11 @@ WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) * know that commit lsn has advanced. 
*/ static WalMessage* -CreateMessageCommitLsnOnly(void) +CreateMessageCommitLsnOnly(XLogRecPtr lsn) { /* Create new message and append it to message queue */ WalMessage* msg; - if (lastSentLsn == 0) - { - /* FIXME: We haven't sent anything yet. Not sure what to do then.. */ - return NULL; - } - msg = (WalMessage*)malloc(sizeof(WalMessage)); if (msgQueueTail != NULL) msgQueueTail->next = msg; @@ -563,8 +714,18 @@ CreateMessageCommitLsnOnly(void) msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; - msg->req.beginLsn = lastSentLsn; - msg->req.endLsn = lastSentLsn; + /* + * This serves two purposes: + * 1) After all msgs from previous epochs are pushed we queue empty + * WalMessage with lsn set to epochStartLsn which commands to switch the + * epoch, which allows to do the switch without creating new epoch + * records (we especially want to avoid such in --sync mode). + * Walproposer can advance commit_lsn only after the switch, so this lsn + * (reported back) also is the first possible advancement point. + * 2) Maintain common invariant of queue entries sorted by LSN. 
+ */ + msg->req.beginLsn = lsn; + msg->req.endLsn = lsn; msg->req.proposerId = proposerGreeting.proposerId; /* truncateLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ return msg; @@ -602,8 +763,9 @@ DetermineEpochStartLsn(void) } } - elog(LOG, "got votes from majority (%d) of nodes, epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", quorum, + propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), walkeeper[donor].host, walkeeper[donor].port, LSN_FORMAT_ARGS(truncateLsn) @@ -682,20 +844,34 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec if (walrcv_startstreaming(wrconn, &options)) { XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn; + XLogRecPtr rec_end_lsn = 0; int len; char *buf; pgsocket wait_fd = PGINVALID_SOCKET; - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) > 0) + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) { - Assert(buf[0] == 'w'); - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - (void)CreateMessage(rec_start_lsn, buf, len); - if (rec_end_lsn >= endpos) - break; + if (len == 0) + { + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w'); + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + (void) CreateMessage(rec_start_lsn, buf, len); + elog(DEBUG1, "Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len); + if (rec_end_lsn >= endpos) + break; + } } + elog(DEBUG1, "end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)); walrcv_disconnect(wrconn); } else @@ 
-1032,6 +1208,20 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); + /* this message signifies epoch switch */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + + if (syncSafekeepers) + { + /* Wait until all walkeepers are synced */ + WalProposerLoop(); + } + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); } WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ @@ -1081,7 +1271,6 @@ AdvancePollState(int i, uint32 events) { WalMessage* next; XLogRecPtr minQuorumLsn; - WalMessage* commitLsnUpdateMsg; /* If our reading doesn't immediately succeed, any necessary error handling or state * setting is taken care of. We can leave any other work until later. */ @@ -1089,7 +1278,6 @@ AdvancePollState(int i, uint32 events) return; next = wk->currMsg->next; - Assert(wk->feedback.flushLsn == wk->currMsg->req.endLsn); wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ wk->currMsg = NULL; @@ -1106,9 +1294,7 @@ AdvancePollState(int i, uint32 events) if (minQuorumLsn > lastSentCommitLsn) { - commitLsnUpdateMsg = CreateMessageCommitLsnOnly(); - if (commitLsnUpdateMsg) - BroadcastMessage(commitLsnUpdateMsg); + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); lastSentCommitLsn = minQuorumLsn; } break; diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 6741e9f82dd..99e62142736 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -244,7 +244,7 @@ typedef struct AppendRequestHeader XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ /* * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last record streamed to everyone) + 
* + 1 of last chunk streamed to everyone) */ XLogRecPtr truncateLsn; pg_uuid_t proposerId; /* for monitoring/debugging */ @@ -289,8 +289,11 @@ typedef struct AppendResponse */ uint64 tag; term_t term; - term_t epoch; + term_t epoch; XLogRecPtr flushLsn; + // Safekeeper reports back his awareness about which WAL is committed, as + // this is a criterion for walproposer --sync mode exit + XLogRecPtr commitLsn; HotStandbyFeedback hs; } AppendResponse; @@ -344,6 +347,8 @@ void ProcessStandbyHSFeedback(TimestampTz replyTime, TransactionId feedbackCatalogXmin, uint32 feedbackCatalogEpoch); void StartReplication(StartReplicationCmd *cmd); +void WalProposerSync(int argc, char *argv[]); + /* libpqwalproposer hooks & helper type */ From 7e3d867319345856ab215fc3010e57f802df6e7d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 27 Aug 2021 16:02:14 +0300 Subject: [PATCH 045/167] Update Dockerfile --- .circleci/config.yml | 6 ++---- Dockerfile | 27 ++++++++++----------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ad48e5ac396..16a271b0386 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,10 +6,8 @@ jobs: docker_image: docker: - image: cimg/base:2021.04 - working_directory: ~/repo steps: - - checkout: - path: ~/repo + - checkout - setup_remote_docker: docker_layer_caching: true - run: @@ -25,7 +23,7 @@ workflows: # Build and push image only for commits to `main`. 
- docker_image: # Context gives an ability to login - context: 'Docker Hub' + context: Docker Hub filters: branches: only: diff --git a/Dockerfile b/Dockerfile index 83407413142..4878e3cc755 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # Image with pre-built tools # FROM zenithdb/compute-tools:latest AS compute-deps -# Only to get ready apply_conf binary as a dep +# Only to get ready zenith_ctl and apply_conf binaries as deps # # Image with Postgres build deps @@ -43,32 +43,25 @@ WORKDIR /pg FROM debian:buster-slim # libreadline-dev is required to run psql -RUN apt-get update && apt-get -yq install openssh-server libreadline-dev && \ - # This will prepare everything needed by sshd - # like generation host keys with ssh-keygen -A - service ssh start +RUN apt-get update && apt-get -yq install libreadline-dev # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres/compute && \ - chown -R postgres:postgres /var/db/postgres/specs && \ + chown -R postgres:postgres /var/db/postgres && \ chmod 0750 /var/db/postgres/compute # Copy ready Postgres binaries -COPY --from=pg-build /pg/compute_build/postgres_bin /var/db/postgres/install +COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local -# Copy apply_conf binary +# Copy binaries from compute-tools COPY --from=compute-deps /usr/local/bin/apply_conf /usr/local/bin/apply_conf +COPY --from=compute-deps /usr/local/bin/zenith_ctl /usr/local/bin/zenith_ctl -# Copy postgres binaries to the common location -RUN cp /var/db/postgres/install/bin/* /usr/local/bin/ && \ - cp -r /var/db/postgres/install/share/* /usr/local/share/ && \ - # Add postgres shared objects to the search path - echo '/var/db/postgres/install/lib' >> /etc/ld.so.conf && /sbin/ldconfig +# Add postgres shared objects to the search path +RUN echo 
'/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig -# To be able to run sshd (seems to be default) -# USER root +USER postgres -ENTRYPOINT ["/bin/sh"] +ENTRYPOINT ["/usr/local/bin/zenith_ctl"] From b661532ad172c0be67c6046c6b618e6e19267f41 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 10 Sep 2021 13:22:08 +0300 Subject: [PATCH 046/167] Fix compiler warnings in walproposer.c --- src/backend/replication/walproposer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 06b227244f9..bee12ad21b7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -316,8 +316,7 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; - int i; - int n_synced; + int n_synced = 0; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) From cc87ff1ff9b71dccbdea4b4ab70d94eff9f2ab99 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 9 Sep 2021 17:30:57 +0300 Subject: [PATCH 047/167] Always advance truncateLsn to commitLsn, keeping it on record boundary. And take initial value from freshly created slot position. Thus proposer always starts streaming from the record beginning; it simplifies WAL decoding on safekeeper. 
--- src/backend/replication/walproposer.c | 88 ++++++++++++++++++++------- 1 file changed, 65 insertions(+), 23 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index bee12ad21b7..0532fb47025 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -79,6 +79,7 @@ static AppendResponse lastFeedback; * + 1 of last chunk streamed to everyone) */ static XLogRecPtr truncateLsn; +static XLogRecPtr candidateTruncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ @@ -344,10 +345,30 @@ HandleWalKeeperResponse(void) { WalMessage* msg = msgQueueHead; msgQueueHead = msg->next; - if (truncateLsn < msg->req.beginLsn) + /* + * This piece is received by everyone; try to advance truncateLsn, but + * hold it back to nearest commitLsn. Thus we will always start + * streaming from the beginning of the record, which simplifies decoding + * on the far end. + * + * This also prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and commitLsn + * generally can't be advanced based on feedback from safekeeper who is + * still in the previous epoch (similar to 'leader can't commit entries + * from previous term' in Raft); 2) chunks we read from WAL and send are + * plain sheets of bytes, but safekeepers ack only on commit boundaries. 
+ */ + if (msg->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) + { + truncateLsn = minQuorumLsn; + candidateTruncateLsn = InvalidXLogRecPtr; + } + else if (msg->req.endLsn >= candidateTruncateLsn && + candidateTruncateLsn != InvalidXLogRecPtr) { - Assert(truncateLsn < msg->req.endLsn); - truncateLsn = msg->req.endLsn; + truncateLsn = candidateTruncateLsn; + candidateTruncateLsn = InvalidXLogRecPtr; } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); @@ -509,6 +530,10 @@ WalProposerMain(Datum main_arg) if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); ReplicationSlotRelease(); } @@ -601,21 +626,7 @@ SendMessageToNode(int i, WalMessage* msg) if (wk->currMsg) { wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - /* - * truncateLsn is advanced immediately once chunk is broadcast to all - * safekeepers, and commitLsn generally can't be advanced based on - * feedback from safekeeper who is still in the previous epoch (similar - * to 'leader can't commit entries from previous term' in Raft), so the - * first might surprisingly get higher than the latter. - * - * Another reason for this will be switch to proper acks from - * safekeepers: they must point to end of last valid record, not just - * end of last received chunk. - * - * Free safekeeper from such surprises by holding back truncateLsn in - * these cases. - */ - wk->currMsg->req.truncateLsn = Min(truncateLsn, wk->currMsg->req.commitLsn); + wk->currMsg->req.truncateLsn = truncateLsn; /* Once we've selected and set up our message, actually start sending it. 
*/ wk->state = SS_SEND_WAL; @@ -741,10 +752,9 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) static void DetermineEpochStartLsn(void) { - // FIXME: If the WAL acceptors have nothing, start from "the beginning of time" - propEpochStartLsn = wal_segment_size; + propEpochStartLsn = InvalidXLogRecPtr; donorEpoch = 0; - truncateLsn = wal_segment_size; + truncateLsn = InvalidXLogRecPtr; for (int i = 0; i < n_walkeepers; i++) { @@ -762,7 +772,28 @@ DetermineEpochStartLsn(void) } } - elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, restart_lsn %X/%X", + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was + * committed yet. To keep the idea of always starting streaming since record + * boundary (which simplifies decoding on safekeeper), take start position + * of the slot. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) + { + (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, SAB_Error); + propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + ReplicationSlotRelease(); + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to some + * connected safekeeper; it must have carried truncateLsn pointing to the + * first record. 
+ */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); + + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", quorum, propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), @@ -1240,8 +1271,9 @@ AdvancePollState(int i, uint32 events) { WalMessage* msg = wk->currMsg; - elog(LOG, "Sending message with len %ld commitLsn=%X/%X restart LSN=%X/%X to %s:%s", + elog(LOG, "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X restart LSN=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), + LSN_FORMAT_ARGS(msg->req.beginLsn), LSN_FORMAT_ARGS(msg->req.commitLsn), LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); @@ -1294,6 +1326,16 @@ AdvancePollState(int i, uint32 events) if (minQuorumLsn > lastSentCommitLsn) { BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + /* + * commitLsn is always the record boundary; remember it so + * we can advance truncateLsn there. But do so only if + * previous value is applied, otherwise it might never catch + * up. + */ + if (candidateTruncateLsn == InvalidXLogRecPtr) + { + candidateTruncateLsn = minQuorumLsn; + } lastSentCommitLsn = minQuorumLsn; } break; From 43dcc39109c4901346fda56fc26feb139383dda8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 Sep 2021 21:21:21 +0300 Subject: [PATCH 048/167] Minor logging editing. 
--- src/backend/replication/walproposer.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 0532fb47025..02051e627f7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -861,10 +861,11 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec err))); return false; } - elog(LOG, "Start recovery from %s:%s starting from %X/%08X till %X/%08X timeline %d", - walkeeper[donor].host, walkeeper[donor].port, - (uint32)(startpos>>32), (uint32)startpos, (uint32)(endpos >> 32), (uint32)endpos, - timeline); + elog(LOG, + "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; options.startpoint = startpos; @@ -1017,7 +1018,8 @@ AdvancePollState(int i, uint32 events) switch (result) { case WP_CONN_POLLING_OK: - elog(LOG, "Connected with node %s:%s", wk->host, wk->port); + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); /* Once we're fully connected, we can move to the next state */ wk->state = SS_EXEC_STARTWALPUSH; @@ -1233,8 +1235,11 @@ AdvancePollState(int i, uint32 events) /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ if (truncateLsn < propEpochStartLsn) { - elog(LOG, "start recovery because restart LSN=%X/%X is not equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), LSN_FORMAT_ARGS(propEpochStartLsn)); + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed 
to recover state"); @@ -1271,12 +1276,13 @@ AdvancePollState(int i, uint32 events) { WalMessage* msg = wk->currMsg; - elog(LOG, "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X restart LSN=%X/%X to %s:%s", + elog(LOG, + "sending message with len %ld beginLsn=%X/%X " + "commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), LSN_FORMAT_ARGS(msg->req.beginLsn), LSN_FORMAT_ARGS(msg->req.commitLsn), - LSN_FORMAT_ARGS(truncateLsn), - wk->host, wk->port); + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); /* We write with msg->size here because the body of the message * is stored after the end of the WalMessage struct, in the From 40c109b0b2725e325c99eb8cf63f141b3b35792c Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 10 Sep 2021 21:21:45 +0300 Subject: [PATCH 049/167] Fix walproposer starting streaming point. Send *all* entries (from the beginning, i.e. truncateLsn) to everyone but donor who doesn't need recovery at all and will receive only new entries. This can be optimized to avoid sending data which is already persisted (and correct), but previous such optimization was incorrect. --- src/backend/replication/walproposer.c | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 02051e627f7..f8aed25255b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -912,23 +912,23 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec timeline, (uint32)(startpos >> 32), (uint32)startpos))); return false; } - /* Setup restart point for all walkeepers */ + + /* + * Start sending entries to everyone from the beginning (truncateLsn), + * except for donor who doesn't need recovery at all. 
We could do here + * better, taking into account commitLsn of safekeepers to avoid sending + * them excessive data, but this requires some effort (note also that we + * must always start sending from the beginning of the record). + * + * And note that we definitely can't pick up flushLsn of safekeeper and + * decide he already has everything before, as such WAL is generally + * entirely different than the correct (donor) one. + */ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_IDLE) + if (walkeeper[i].state == SS_IDLE && i != donor) { - for (WalMessage* msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) - { - msg->ackMask |= 1 << i; /* message is already received by this walkeeper */ - } - else - { - SendMessageToNode(i, msg); - break; - } - } + SendMessageToNode(i, msgQueueHead); } } return true; From 3e79e7b571df44d6a0bab43f88deea540fa45000 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 14 Sep 2021 16:41:35 +0300 Subject: [PATCH 050/167] Mark all recovery messages as received by the donor. I forgot to do that in 42316a81d3. Fixes segfault related to attempt to send the (garbage collected) message second time and queue advancement when donor doesn't restart. --- src/backend/replication/walproposer.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index f8aed25255b..5deb83579d6 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -613,9 +613,8 @@ SendMessageToNode(int i, WalMessage* msg) /* we shouldn't be already sending something */ Assert(wk->currMsg == NULL); /* - * Skip already acknowledged messages. Used during start to get to the - * first not yet received message. Otherwise we always just send - * 'msg'. + * Skip already acknowledged messages. 
Used after reconnection to get to the + * first not yet sent message. Otherwise we always just send 'msg'. */ while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; @@ -931,6 +930,11 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec SendMessageToNode(i, msgQueueHead); } } + /* Mark all recovery messages as already received by the donor. */ + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) + { + msg->ackMask |= 1 << donor; + } return true; } @@ -1243,7 +1247,12 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); - /* this message signifies epoch switch */ + /* + * This message signifies epoch switch; it is needed to + * make the switch happen on donor, as he won't get any + * other messages until we start writing new WAL (and we + * e.g. don't in --sync mode at all) + */ BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); if (syncSafekeepers) From 2750bac477d50fc01338fc86eeef21046b83b531 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 14 Sep 2021 17:41:49 +0300 Subject: [PATCH 051/167] Don't FATAL in walproposer when EOF arrives in SS_IDLE state. 
--- src/backend/replication/walproposer.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5deb83579d6..2ae1e4d86aa 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -224,8 +224,6 @@ ResetConnection(int i) if (wk->state != SS_OFFLINE) { - elog(WARNING, "Connection with node %s:%s in %s state failed", - wk->host, wk->port, FormatWalKeeperState(wk->state)); ShutdownConnection(i); } @@ -1187,9 +1185,10 @@ AdvancePollState(int i, uint32 events) * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to * SS_SEND_VOTE. */ case SS_VOTING: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is voting", - wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(i); + break; /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: @@ -1276,8 +1275,10 @@ AdvancePollState(int i, uint32 events) /* Idle state for sending WAL. Moved out only by calls to * SendMessageToNode */ case SS_IDLE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is idle", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(i); + break; /* Start to send the message at wk->currMsg. Triggered only by calls * to SendMessageToNode */ From 153d0f64683b64991363e77e52fdbc7dde66cc7d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Sep 2021 15:13:14 +0300 Subject: [PATCH 052/167] Optimize walproposer starting streaming point. Safekeepers who are in the same epoch as donor definitely have correct WAL, so we can send to them since their flushLsn. 
This required some additionall fuss due to convention of always starting streaming at the record boundary. --- src/backend/replication/walproposer.c | 94 ++++++++++++++++++++------- src/include/replication/walproposer.h | 6 ++ 2 files changed, 77 insertions(+), 23 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 2ae1e4d86aa..d5e9730c526 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -368,6 +368,11 @@ HandleWalKeeperResponse(void) truncateLsn = candidateTruncateLsn; candidateTruncateLsn = InvalidXLogRecPtr; } + for (int i = 0; i < n_walkeepers; i++) + { + if (msg->perSafekeeper[i]) + free(msg->perSafekeeper[i]); + } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } @@ -677,6 +682,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; msg->ackMask = 0; + memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -718,6 +724,7 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->size = sizeof(AppendRequestHeader); msg->next = NULL; msg->ackMask = 0; + memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -912,26 +919,54 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec /* * Start sending entries to everyone from the beginning (truncateLsn), - * except for donor who doesn't need recovery at all. We could do here - * better, taking into account commitLsn of safekeepers to avoid sending - * them excessive data, but this requires some effort (note also that we - * must always start sending from the beginning of the record). 
- * - * And note that we definitely can't pick up flushLsn of safekeeper and - * decide he already has everything before, as such WAL is generally - * entirely different than the correct (donor) one. + * except for those who lives in donor's epoch and thus for sure has correct + * WAL. We could do here even slightly better, taking into account commitLsn + * of the rest to avoid sending them excessive data. */ for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_IDLE && i != donor) + if (walkeeper[i].state != SS_IDLE) + continue; + + if (walkeeper[i].voteResponse.epoch != donorEpoch) { SendMessageToNode(i, msgQueueHead); } - } - /* Mark all recovery messages as already received by the donor. */ - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - msg->ackMask |= 1 << donor; + else + { + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) + { + if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) + { + /* message is already received by this walkeeper */ + msg->ackMask |= 1 << i; + } + else + { + uint32 len; + uint32 size; + + /* + * By convention we always stream since the beginning of the + * record, and flushLsn points to it -- form the message + * starting there. 
+ */ + len = msg->req.endLsn - walkeeper[i].voteResponse.flushLsn; + size = sizeof(AppendRequestHeader) + len; + msg->perSafekeeper[i] = malloc(size); + *msg->perSafekeeper[i] = msg->req; + msg->perSafekeeper[i]->beginLsn = + walkeeper[i].voteResponse.flushLsn; + memcpy(&msg->perSafekeeper[i] + 1, + (char *) (&msg->req + 1) + + walkeeper[i].voteResponse.flushLsn - + msg->req.beginLsn, + len); + SendMessageToNode(i, msg); + break; + } + } + } } return true; } @@ -1206,11 +1241,17 @@ AdvancePollState(int i, uint32 events) if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + wk->host, wk->port, wk->voteResponse.voteGiven, wk->voteResponse.epoch, + LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + /* - * In case of acceptor rejecting our vote, bail out, but only if - * either it already lives in strictly higher term (concurrent - * compute spotted) or we are not elected yet and thus need the - * vote. + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. 
*/ if ((!wk->voteResponse.voteGiven) && (wk->voteResponse.term > propTerm || n_votes < quorum)) @@ -1285,19 +1326,26 @@ AdvancePollState(int i, uint32 events) case SS_SEND_WAL: { WalMessage* msg = wk->currMsg; + AppendRequestHeader *req = &msg->req; + + /* if there is a message specially crafted for this safekeeper, send it */ + if (msg->perSafekeeper[i]) + req = msg->perSafekeeper[i]; elog(LOG, - "sending message with len %ld beginLsn=%X/%X " - "commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", msg->size - sizeof(AppendRequestHeader), - LSN_FORMAT_ARGS(msg->req.beginLsn), - LSN_FORMAT_ARGS(msg->req.commitLsn), + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); /* We write with msg->size here because the body of the message * is stored after the end of the WalMessage struct, in the * allocation for each msg */ - if (!AsyncWrite(i, &msg->req, msg->size, SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + if (!AsyncWrite(i, req, + sizeof(AppendRequestHeader) + req->endLsn - + req->beginLsn, + SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) return; break; diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 99e62142736..c455d0564e9 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -259,6 +259,12 @@ struct WalMessage WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ + /* + * By convention safekeeper starts receiving data since record boundary, we + * may need to send first message not from the chunk beginning for that; + * such trimmed message is formed here. 
+ */ + AppendRequestHeader *perSafekeeper[MAX_WALKEEPERS]; AppendRequestHeader req; /* request to walkeeper (message header) */ /* PHANTOM FIELD: From 7648f055b862e1e9489426c2a4ef5d65279cc0eb Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:27 +0300 Subject: [PATCH 053/167] Silence a compiler warning. This one: src/backend/tcop/zenith_wal_redo.c:294:2: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement] 294 | bool enable_seccomp = true; | ^~~~ There are a few more warnings that look a bit more tricky to fix, but let's at least silence this for now. In the passing, also move the 'n_synced' local variable closer to where it's used. --- src/backend/replication/walproposer.c | 4 +++- src/backend/tcop/zenith_wal_redo.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d5e9730c526..5eab36461f8 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -315,7 +315,6 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; - int n_synced = 0; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) @@ -393,6 +392,9 @@ HandleWalKeeperResponse(void) */ if (syncSafekeepers) { + int n_synced; + + n_synced = 0; for (int i = 0; i < n_walkeepers; i++) { WalKeeper *wk = &walkeeper[i]; diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 7e00a9e985d..1a17a3202ef 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -156,6 +156,7 @@ WalRedoMain(int argc, char *argv[], { int firstchar; StringInfoData input_message; + bool enable_seccomp; /* Initialize startup process environment if necessary. 
*/ InitStandaloneProcess(argv[0]); @@ -291,7 +292,7 @@ WalRedoMain(int argc, char *argv[], #ifdef HAVE_LIBSECCOMP /* We prefer opt-out to opt-in for greater security */ - bool enable_seccomp = true; + enable_seccomp = true; for (int i = 1; i < argc; i++) if (strcmp(argv[i], "--disable-seccomp") == 0) enable_seccomp = false; From afaad8f49996c57d9582163c19259930527c6c5b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:30 +0300 Subject: [PATCH 054/167] Remove unused functions for reading non-rel pages. These could be used to fetch SLRUs and other non-relation things from the page server. But we don't do that, and have no plans in the near future. --- contrib/zenith/pagestore_client.h | 3 -- contrib/zenith/pagestore_smgr.c | 73 ++----------------------------- 2 files changed, 4 insertions(+), 72 deletions(-) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index dbcaa5fdb91..d94cbcb5185 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -123,9 +123,6 @@ extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); -extern bool zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum); -extern void zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum); - /* zenith wal-redo storage manager functionality */ extern void inmem_init(void); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 569d1c330d8..be8277b2fee 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -414,7 +414,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool nonrel) +zenith_get_request_lsn(void) { XLogRecPtr lsn; @@ -431,11 +431,6 @@ zenith_get_request_lsn(bool nonrel) lsn = 
InvalidXLogRecPtr; elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); } - else if (nonrel) - { - lsn = GetLastImportantRecPtr(); - elog(DEBUG1, "zenith_get_request_lsn norel GetLastImportantRecPtr %X/%X", (uint32) ((lsn) >> 32), (uint32) (lsn)); - } else { XLogRecPtr flushlsn; @@ -485,7 +480,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) .rnode = reln->smgr_rnode.node, .forknum = forkNum }, - .lsn = zenith_get_request_lsn(false) + .lsn = zenith_get_request_lsn() }); ok = resp->ok; pfree(resp); @@ -640,7 +635,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, ZenithResponse *resp; XLogRecPtr request_lsn; - request_lsn = zenith_get_request_lsn(false); + request_lsn = zenith_get_request_lsn(); resp = page_server->request((ZenithRequest) { .tag = T_ZenithReadRequest, .page_key = { @@ -765,66 +760,6 @@ hexdump_page(char *page) } #endif - -bool -zenith_nonrel_page_exists(RelFileNode rnode, BlockNumber blkno, int forknum) -{ - bool ok; - ZenithResponse *resp; - - elog(SmgrTrace, "[ZENITH_SMGR] zenith_nonrel_page_exists relnode %u/%u/%u_%d blkno %u", - rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno); - - resp = page_server->request((ZenithRequest) { - .tag = T_ZenithExistsRequest, - .page_key = { - .rnode = rnode, - .forknum = forknum, - .blkno = blkno - }, - .lsn = zenith_get_request_lsn(true) - }); - ok = resp->ok; - pfree(resp); - return ok; -} - -void -zenith_read_nonrel(RelFileNode rnode, BlockNumber blkno, char *buffer, int forknum) -{ - int bufsize = BLCKSZ; - ZenithResponse *resp; - XLogRecPtr lsn; - - //43 is magic for RELMAPPER_FILENAME in page cache - // relmapper files has non-standard size of 512bytes - if (forknum == 43) - bufsize = 512; - - lsn = zenith_get_request_lsn(true); - - elog(SmgrTrace, "[ZENITH_SMGR] read nonrel relnode %u/%u/%u_%d blkno %u lsn %X/%X", - rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blkno, - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - resp = 
page_server->request((ZenithRequest) { - .tag = T_ZenithReadRequest, - .page_key = { - .rnode = rnode, - .forknum = forknum, - .blkno = blkno - }, - .lsn = lsn - }); - - if (!resp->ok) - elog(ERROR, "[ZENITH_SMGR] smgr page not found"); - - memcpy(buffer, resp->page, bufsize); - pfree(resp); -} - - /* * zenith_write() -- Write the supplied block at the appropriate location. * @@ -867,7 +802,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) return n_blocks; - request_lsn = zenith_get_request_lsn(false); + request_lsn = zenith_get_request_lsn(); resp = page_server->request((ZenithRequest) { .tag = T_ZenithNblocksRequest, .page_key = { From ae5ee63f947b01f25e0137c21400051722ebb733 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:32 +0300 Subject: [PATCH 055/167] Misc cleanup in the code that communicates with the page server. - Remove unused 'system_id' field from ZenithRequest. - Remove unused 'loaded' variable. - Remove unused to pack pageserver->client messages, and to unpack client->pageserver messages. - Fix printing the response in debug message (was printing the request twice) - Avoid the overhead of converting request/response to string, unless the debug message is really going to be printed - Formatting fixes. 
--- contrib/zenith/libpagestore.c | 15 ++++---- contrib/zenith/pagestore_client.h | 23 ++++++------ contrib/zenith/pagestore_smgr.c | 61 ++++++++++--------------------- 3 files changed, 39 insertions(+), 60 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index b726cee80f8..292920f56da 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -42,7 +42,7 @@ void _PG_init(void); bool connected = false; PGconn *pageserver_conn; -static ZenithResponse * zenith_call(ZenithRequest request); +static ZenithResponse * zenith_call(ZenithRequest *request); page_server_api api = { .request = zenith_call }; @@ -200,7 +200,7 @@ zenith_connect() static ZenithResponse * -zenith_call(ZenithRequest request) +zenith_call(ZenithRequest *request) { StringInfoData req_buff; StringInfoData resp_buff; @@ -217,7 +217,7 @@ zenith_call(ZenithRequest request) if (!connected) zenith_connect(); - req_buff = zm_pack((ZenithMessage *) & request); + req_buff = zm_pack_request(request); /* send request */ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) @@ -227,8 +227,9 @@ zenith_call(ZenithRequest request) } pfree(req_buff.data); + if (message_level_is_interesting(PqPageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) & request); + char *msg = zm_to_string((ZenithMessage *) request); zenith_log(PqPageStoreTrace, "Sent request: %s", msg); pfree(msg); @@ -243,21 +244,21 @@ zenith_call(ZenithRequest request) else if (resp_buff.len == -2) zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - resp = zm_unpack(&resp_buff); + resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); Assert(messageTag(resp) == T_ZenithStatusResponse || messageTag(resp) == T_ZenithNblocksResponse || messageTag(resp) == T_ZenithReadResponse); + if (message_level_is_interesting(PqPageStoreTrace)) { - char *msg = zm_to_string((ZenithMessage *) & request); + char 
*msg = zm_to_string((ZenithMessage *) resp); zenith_log(PqPageStoreTrace, "Got response: %s", msg); pfree(msg); } - /* * XXX: zm_to_string leak strings. Check with what memory contex all this * methods are called. diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index d94cbcb5185..9600c974f70 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -36,14 +36,14 @@ typedef enum T_ZenithStatusResponse = 100, T_ZenithNblocksResponse, T_ZenithReadResponse, -} ZenithMessageTag; +} ZenithMessageTag; /* base struct for c-style inheritance */ typedef struct { ZenithMessageTag tag; -} ZenithMessage; +} ZenithMessage; #define messageTag(m) (((const ZenithMessage *)(m))->tag) @@ -54,15 +54,14 @@ typedef struct RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; -} PageKey; +} PageKey; typedef struct { ZenithMessageTag tag; - uint64 system_id; PageKey page_key; XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; +} ZenithRequest; typedef struct { @@ -70,11 +69,11 @@ typedef struct bool ok; uint32 n_blocks; char page[1]; -} ZenithResponse; +} ZenithResponse; -StringInfoData zm_pack(ZenithMessage * msg); -ZenithMessage *zm_unpack(StringInfo s); -char *zm_to_string(ZenithMessage * msg); +extern StringInfoData zm_pack_request(ZenithRequest *msg); +extern ZenithMessage *zm_unpack_response(StringInfo s); +extern char *zm_to_string(ZenithMessage *msg); /* * API @@ -82,10 +81,10 @@ char *zm_to_string(ZenithMessage * msg); typedef struct { - ZenithResponse *(*request) (ZenithRequest request); -} page_server_api; + ZenithResponse *(*request) (ZenithRequest *request); +} page_server_api; -extern page_server_api * page_server; +extern page_server_api *page_server; extern char *page_server_connstring; extern char *callmemaybe_connstring; diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index be8277b2fee..62c30808481 100644 --- a/contrib/zenith/pagestore_smgr.c +++ 
b/contrib/zenith/pagestore_smgr.c @@ -50,8 +50,6 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; -bool loaded = false; - page_server_api *page_server; /* GUCs */ @@ -72,7 +70,7 @@ char const *const ZenithMessageStr[] = }; StringInfoData -zm_pack(ZenithMessage *msg) +zm_pack_request(ZenithRequest *msg) { StringInfoData s; @@ -98,56 +96,25 @@ zm_pack(ZenithMessage *msg) break; } - /* pagestore -> pagestore_client */ + /* pagestore -> pagestore_client. We never need to create these. */ case T_ZenithStatusResponse: case T_ZenithNblocksResponse: - { - ZenithResponse *msg_resp = (ZenithResponse *) msg; - pq_sendbyte(&s, msg_resp->ok); - pq_sendint32(&s, msg_resp->n_blocks); - break; - } case T_ZenithReadResponse: - { - ZenithResponse *msg_resp = (ZenithResponse *) msg; - pq_sendbyte(&s, msg_resp->ok); - pq_sendint32(&s, msg_resp->n_blocks); - pq_sendbytes(&s, msg_resp->page, BLCKSZ); // XXX: should be varlena + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); break; - } } return s; } ZenithMessage * -zm_unpack(StringInfo s) +zm_unpack_response(StringInfo s) { ZenithMessageTag tag = pq_getmsgbyte(s); ZenithMessage *msg = NULL; switch (tag) { - /* pagestore_client -> pagestore */ - case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithReadRequest: - { - ZenithRequest *msg_req = palloc0(sizeof(ZenithRequest)); - - msg_req->tag = tag; - msg_req->system_id = 42; - msg_req->page_key.rnode.spcNode = pq_getmsgint(s, 4); - msg_req->page_key.rnode.dbNode = pq_getmsgint(s, 4); - msg_req->page_key.rnode.relNode = pq_getmsgint(s, 4); - msg_req->page_key.forknum = pq_getmsgbyte(s); - msg_req->page_key.blkno = pq_getmsgint(s, 4); - msg_req->lsn = pq_getmsgint64(s); - pq_getmsgend(s); - - msg = (ZenithMessage *) msg_req; - break; - } - /* pagestore -> pagestore_client */ case T_ZenithStatusResponse: case T_ZenithNblocksResponse: @@ -176,6 +143,18 @@ zm_unpack(StringInfo s) msg = (ZenithMessage *) msg_resp; break; } + + 
/* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. + */ + case T_ZenithExistsRequest: + case T_ZenithNblocksRequest: + case T_ZenithReadRequest: + default: + elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + break; } return msg; @@ -474,7 +453,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) bool ok; ZenithResponse *resp; - resp = page_server->request((ZenithRequest) { + resp = page_server->request(&(ZenithRequest) { .tag = T_ZenithExistsRequest, .page_key = { .rnode = reln->smgr_rnode.node, @@ -636,7 +615,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn; request_lsn = zenith_get_request_lsn(); - resp = page_server->request((ZenithRequest) { + resp = page_server->request(&(ZenithRequest) { .tag = T_ZenithReadRequest, .page_key = { .rnode = reln->smgr_rnode.node, @@ -803,7 +782,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; request_lsn = zenith_get_request_lsn(); - resp = page_server->request((ZenithRequest) { + resp = page_server->request(&(ZenithRequest) { .tag = T_ZenithNblocksRequest, .page_key = { .rnode = reln->smgr_rnode.node, From bf4c0cbeb465c2bfa7294821a67277df7a729b01 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:07:34 +0300 Subject: [PATCH 056/167] Improve the protocol between Postgres and page server. - Use different message formats for different kinds of response messages. - Add an Error response message, for passing errors from page server to Postgres. An Error response now results in an ereport(ERROR - Add a flag to requests, to indicate that we actually want the latest page version on the timeline, and the LSN is just a hint that we know that there haven't been any modifications since that LSN. It is currently always set to 'true', but once we start supporting read-only replicas, they would set it to false. 
This changes the network postgres<->page server protocol, so this needs corresponding changes in the page server side Also refactor and fix the zm_to_string() function. The ZenithMessageStr array was broken, because the array indices didn't match the ZenithMessageTag enum values. --- contrib/zenith/libpagestore.c | 6 +- contrib/zenith/pagestore_client.h | 68 ++++-- contrib/zenith/pagestore_smgr.c | 366 ++++++++++++++++++++++-------- 3 files changed, 323 insertions(+), 117 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 292920f56da..f29cb8f509c 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -204,7 +204,7 @@ zenith_call(ZenithRequest *request) { StringInfoData req_buff; StringInfoData resp_buff; - ZenithMessage *resp; + ZenithResponse *resp; /* If the connection was lost for some reason, reconnect */ if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) @@ -247,10 +247,6 @@ zenith_call(ZenithRequest *request) resp = zm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); - Assert(messageTag(resp) == T_ZenithStatusResponse - || messageTag(resp) == T_ZenithNblocksResponse - || messageTag(resp) == T_ZenithReadResponse); - if (message_level_is_interesting(PqPageStoreTrace)) { char *msg = zm_to_string((ZenithMessage *) resp); diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 9600c974f70..073568f90c3 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -30,15 +30,15 @@ typedef enum /* pagestore_client -> pagestore */ T_ZenithExistsRequest = 0, T_ZenithNblocksRequest, - T_ZenithReadRequest, + T_ZenithGetPageRequest, /* pagestore -> pagestore_client */ - T_ZenithStatusResponse = 100, + T_ZenithExistsResponse = 100, T_ZenithNblocksResponse, - T_ZenithReadResponse, + T_ZenithGetPageResponse, + T_ZenithErrorResponse, } ZenithMessageTag; - /* base struct for c-style inheritance */ typedef struct { @@ -47,32 +47,74 
@@ typedef struct #define messageTag(m) (((const ZenithMessage *)(m))->tag) -extern char const *const ZenithMessageStr[]; +/* + * supertype of all the Zenith*Request structs below + * + * If 'latest' is true, we are requesting the latest page version, and 'lsn' + * is just a hint to the server that we know there are no versions of the page + * (or relation size, for exists/nblocks requests) later than the 'lsn'. + */ +typedef struct +{ + ZenithMessageTag tag; + bool latest; /* if true, request latest page version */ + XLogRecPtr lsn; /* request page version @ this LSN */ +} ZenithRequest; + +typedef struct +{ + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithExistsRequest; typedef struct { + ZenithRequest req; + RelFileNode rnode; + ForkNumber forknum; +} ZenithNblocksRequest; + +typedef struct +{ + ZenithRequest req; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; -} PageKey; +} ZenithGetPageRequest; +/* supertype of all the Zenith*Response structs below */ typedef struct { ZenithMessageTag tag; - PageKey page_key; - XLogRecPtr lsn; /* request page version @ this LSN */ -} ZenithRequest; +} ZenithResponse; + +typedef struct +{ + ZenithMessageTag tag; + bool exists; +} ZenithExistsResponse; typedef struct { ZenithMessageTag tag; - bool ok; uint32 n_blocks; - char page[1]; -} ZenithResponse; +} ZenithNblocksResponse; + +typedef struct +{ + ZenithMessageTag tag; + char page[FLEXIBLE_ARRAY_MEMBER]; +} ZenithGetPageResponse; + +typedef struct +{ + ZenithMessageTag tag; + char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */ +} ZenithErrorResponse; extern StringInfoData zm_pack_request(ZenithRequest *msg); -extern ZenithMessage *zm_unpack_response(StringInfo s); +extern ZenithResponse *zm_unpack_response(StringInfo s); extern char *zm_to_string(ZenithMessage *msg); /* diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 62c30808481..6a2745eb944 100644 --- 
a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -59,16 +59,6 @@ char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; -char const *const ZenithMessageStr[] = -{ - "ZenithExistsRequest", - "ZenithNblocksRequest", - "ZenithReadRequest", - "ZenithStatusResponse", - "ZenithReadResponse", - "ZenithNblocksResponse", -}; - StringInfoData zm_pack_request(ZenithRequest *msg) { @@ -81,25 +71,51 @@ zm_pack_request(ZenithRequest *msg) { /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } case T_ZenithNblocksRequest: - case T_ZenithReadRequest: { - ZenithRequest *msg_req = (ZenithRequest *) msg; + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - pq_sendint32(&s, msg_req->page_key.rnode.spcNode); - pq_sendint32(&s, msg_req->page_key.rnode.dbNode); - pq_sendint32(&s, msg_req->page_key.rnode.relNode); - pq_sendbyte(&s, msg_req->page_key.forknum); - pq_sendint32(&s, msg_req->page_key.blkno); - pq_sendint64(&s, msg_req->lsn); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); break; } /* pagestore -> 
pagestore_client. We never need to create these. */ - case T_ZenithStatusResponse: + case T_ZenithExistsResponse: case T_ZenithNblocksResponse: - case T_ZenithReadResponse: + case T_ZenithGetPageResponse: + case T_ZenithErrorResponse: default: elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); break; @@ -107,40 +123,66 @@ zm_pack_request(ZenithRequest *msg) return s; } -ZenithMessage * +ZenithResponse * zm_unpack_response(StringInfo s) { ZenithMessageTag tag = pq_getmsgbyte(s); - ZenithMessage *msg = NULL; + ZenithResponse *resp = NULL; switch (tag) { /* pagestore -> pagestore_client */ - case T_ZenithStatusResponse: + case T_ZenithExistsResponse: + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + case T_ZenithNblocksResponse: { - ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse)); + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); msg_resp->tag = tag; - msg_resp->ok = pq_getmsgbyte(s); msg_resp->n_blocks = pq_getmsgint(s, 4); pq_getmsgend(s); - msg = (ZenithMessage *) msg_resp; + resp = (ZenithResponse *) msg_resp; break; } - case T_ZenithReadResponse: + case T_ZenithGetPageResponse: { - ZenithResponse *msg_resp = palloc0(sizeof(ZenithResponse) + BLCKSZ); + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); msg_resp->tag = tag; - msg_resp->ok = pq_getmsgbyte(s); - msg_resp->n_blocks = pq_getmsgint(s, 4); memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); // XXX: should be varlena pq_getmsgend(s); - msg = (ZenithMessage *) msg_resp; + resp = (ZenithResponse *) msg_resp; + break; + } + + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen 
+ 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; break; } @@ -151,13 +193,13 @@ zm_unpack_response(StringInfo s) */ case T_ZenithExistsRequest: case T_ZenithNblocksRequest: - case T_ZenithReadRequest: + case T_ZenithGetPageRequest: default: - elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); + elog(ERROR, "unexpected zenith message tag 0x%02x", tag); break; } - return msg; + return resp; } /* dump to json for debugging / error reporting purposes */ @@ -168,52 +210,107 @@ zm_to_string(ZenithMessage *msg) initStringInfo(&s); - appendStringInfoString(&s, "{"); - appendStringInfo(&s, "\"type\": \"%s\"", ZenithMessageStr[msg->tag]); - switch (messageTag(msg)) { /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: - case T_ZenithNblocksRequest: - case T_ZenithReadRequest: { - ZenithRequest *msg_req = (ZenithRequest *) msg; + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } - appendStringInfo(&s, ", \"page_key\": \"%d.%d.%d.%d.%u\", \"lsn\": \"%X/%X\"}", - msg_req->page_key.rnode.spcNode, - msg_req->page_key.rnode.dbNode, - msg_req->page_key.rnode.relNode, - msg_req->page_key.forknum, - msg_req->page_key.blkno, - (uint32) (msg_req->lsn >> 32), (uint32) (msg_req->lsn)); + case T_ZenithNblocksRequest: + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + 
msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithGetPageRequest: + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); break; } /* pagestore -> pagestore_client */ - case T_ZenithStatusResponse: - case T_ZenithNblocksResponse: + case T_ZenithExistsResponse: { - ZenithResponse *msg_resp = (ZenithResponse *) msg; + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; - appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u}", - msg_resp->ok, - msg_resp->n_blocks + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists ); + appendStringInfoChar(&s, '}'); break; } - case T_ZenithReadResponse: + case T_ZenithNblocksResponse: { - ZenithResponse *msg_resp = (ZenithResponse *) msg; + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; - appendStringInfo(&s, ", \"ok\": %d, \"n_blocks\": %u, \"page\": \"XXX\"}", - msg_resp->ok, + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", msg_resp->n_blocks ); + appendStringInfoChar(&s, '}'); + break; } + case T_ZenithGetPageResponse: + { +#if 0 + ZenithGetPageResponse 
*msg_resp = (ZenithGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_ZenithErrorResponse: + { + ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); } return s.data; } @@ -393,7 +490,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(void) +zenith_get_request_lsn(bool *latest) { XLogRecPtr lsn; @@ -402,7 +499,6 @@ zenith_get_request_lsn(void) lsn = GetXLogReplayRecPtr(NULL); elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = InvalidXLogRecPtr; } else if (am_walsender) @@ -440,6 +536,12 @@ zenith_get_request_lsn(void) XLogFlush(lsn); } } + + /* + * FIXME: In read-only mode, we would need to set *latest=false here. 
But we don't + * support read-only mode at the moment + */ + *latest = true; return lsn; } @@ -450,20 +552,46 @@ zenith_get_request_lsn(void) bool zenith_exists(SMgrRelation reln, ForkNumber forkNum) { - bool ok; + bool exists; ZenithResponse *resp; + bool latest; + XLogRecPtr request_lsn; - resp = page_server->request(&(ZenithRequest) { - .tag = T_ZenithExistsRequest, - .page_key = { + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithExistsRequest request = { + .req.tag = T_ZenithExistsRequest, + .req.latest = latest, + .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, .forknum = forkNum - }, - .lsn = zenith_get_request_lsn() - }); - ok = resp->ok; + }; + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) { + case T_ZenithExistsResponse: + exists = ((ZenithExistsResponse *) resp)->exists; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } pfree(resp); - return ok; + return exists; } /* @@ -609,37 +737,52 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, */ void zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) + char *buffer) { ZenithResponse *resp; + bool latest; XLogRecPtr request_lsn; - request_lsn = zenith_get_request_lsn(); - resp = page_server->request(&(ZenithRequest) { - .tag = T_ZenithReadRequest, - .page_key = { + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithGetPageRequest request = { + .req.tag = T_ZenithGetPageRequest, + .req.latest = latest, + .req.lsn = 
request_lsn, .rnode = reln->smgr_rnode.node, .forknum = forkNum, .blkno = blkno - }, - .lsn = request_lsn - }); - - if (!resp->ok) - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn))); - - memcpy(buffer, resp->page, BLCKSZ); - ((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; /* Clear PD_WAL_LOGGED bit stored in WAL record */ + }; + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) { + case T_ZenithGetPageResponse: + memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + pfree(resp); + /* Clear PD_WAL_LOGGED bit stored in WAL record */ + ((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -776,21 +919,46 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) { ZenithResponse *resp; BlockNumber n_blocks; - XLogRecPtr request_lsn; + bool latest; + XLogRecPtr request_lsn; if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) return n_blocks; - request_lsn = zenith_get_request_lsn(); - resp = page_server->request(&(ZenithRequest) { - .tag = T_ZenithNblocksRequest, - .page_key = { + request_lsn = zenith_get_request_lsn(&latest); + { + 
ZenithNblocksRequest request = { + .req.tag = T_ZenithNblocksRequest, + .req.latest = latest, + .req.lsn = request_lsn, .rnode = reln->smgr_rnode.node, .forknum = forknum, - }, - .lsn = request_lsn - }); - n_blocks = resp->n_blocks; + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) { + case T_ZenithNblocksResponse: + n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks); elog(SmgrTrace, "zenith_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", From 3dc6bfb100e910a02ca7b630ae4e6b5161a32752 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:40:50 +0300 Subject: [PATCH 057/167] Silence compiler warning when building without --enable-seccomp --- src/backend/tcop/zenith_wal_redo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 1a17a3202ef..15db900cc8a 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -156,7 +156,9 @@ WalRedoMain(int argc, char *argv[], { int firstchar; StringInfoData input_message; +#ifdef HAVE_LIBSECCOMP bool enable_seccomp; +#endif /* Initialize startup process environment if necessary. 
*/ InitStandaloneProcess(argv[0]); From 75dc8eab981ddc39ca8aab3d55bd213755f6aa95 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 16:57:01 +0300 Subject: [PATCH 058/167] Run 'pgindent' on zenith-specific code in contrib/zenith and in walproposer.c --- contrib/zenith/libpagestore.c | 92 +-- contrib/zenith/pagestore_smgr.c | 525 +++++++-------- contrib/zenith/relsize_cache.c | 42 +- src/backend/replication/walproposer.c | 884 +++++++++++++++----------- src/tools/pgindent/typedefs.list | 15 + 5 files changed, 880 insertions(+), 678 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index f29cb8f509c..9fe6e2aea62 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -42,7 +42,7 @@ void _PG_init(void); bool connected = false; PGconn *pageserver_conn; -static ZenithResponse * zenith_call(ZenithRequest *request); +static ZenithResponse *zenith_call(ZenithRequest *request); page_server_api api = { .request = zenith_call }; @@ -50,32 +50,34 @@ page_server_api api = { static void zenith_connect() { - char *query; - int ret; - char *auth_token; - char *err = NULL; + char *query; + int ret; + char *auth_token; + char *err = NULL; PQconninfoOption *conn_options; PQconninfoOption *conn_option; - int noptions = 0; + int noptions = 0; - // this is heavily inspired by psql/command.c::do_connect - conn_options = PQconninfoParse( - page_server_connstring, - &err - ); + /* this is heavily inspired by psql/command.c::do_connect */ + conn_options = PQconninfoParse(page_server_connstring, &err); - if (conn_options == NULL) { + if (conn_options == NULL) + { /* The error string is malloc'd, so we must free it explicitly */ char *errcopy = err ? 
pstrdup(err) : "out of memory"; + PQfreemem(err); ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); + errmsg("invalid connection string syntax: %s", errcopy))); } - // Trying to populate pageserver connection string with auth token from environment. - // We are looking for password in with placeholder value like $ENV_VAR_NAME, so if password field is present - // and starts with $ we try to fetch environment variable value and fail loudly if it is not set + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. + */ for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) { noptions++; @@ -83,50 +85,50 @@ zenith_connect() { if (conn_option->val != NULL && conn_option->val[0] != '\0') { - // ensure that this is a template - if (strncmp(conn_option->val, "$", 1) != 0) { - ereport( - ERROR, - ( - errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]) - ) - ); - } - + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); auth_token = getenv(&conn_option->val[1]); - if (!auth_token) { - ereport( - ERROR, - ( - errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]) - ) - ); - } else { + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + 
errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { zenith_log(LOG, "using auth token from environment passed via env"); - // inspired by PQconninfoFree and conninfo_storeval - // so just free the old one and replace with freshly malloc'ed one - free(conn_option->val); - conn_option->val = strdup(auth_token); + /* + * inspired by PQconninfoFree and conninfo_storeval so + * just free the old one and replace with freshly + * malloc'ed one + */ + free(conn_option->val); + conn_option->val = strdup(auth_token); } } } } - // copy values from PQconninfoOption to key/value arrays because PQconnectdbParams accepts options this way + /* + * copy values from PQconninfoOption to key/value arrays because + * PQconnectdbParams accepts options this way + */ const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); const char **values = malloc((noptions + 1) * sizeof(*values)); - int i = 0; - + int i = 0; + for (i = 0; i < noptions; i++) { keywords[i] = conn_options[i].keyword; values[i] = conn_options[i].val; } - // add array terminator + /* add array terminator */ keywords[i] = NULL; values[i] = NULL; @@ -148,7 +150,7 @@ zenith_connect() } /* Ask the Page Server to connect to us, and stream WAL from us. 
*/ - if (callmemaybe_connstring && callmemaybe_connstring[0] + if (callmemaybe_connstring && callmemaybe_connstring[0] && zenith_tenant && zenith_timeline) { diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 6a2745eb944..ac7e94f74c0 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -48,70 +48,70 @@ static char *hexdump_page(char *page); #define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId) -const int SmgrTrace = DEBUG5; +const int SmgrTrace = DEBUG5; page_server_api *page_server; /* GUCs */ -char *page_server_connstring; -char *callmemaybe_connstring; -char *zenith_timeline; -char *zenith_tenant; -bool wal_redo = false; +char *page_server_connstring; +char *callmemaybe_connstring; +char *zenith_timeline; +char *zenith_tenant; +bool wal_redo = false; StringInfoData zm_pack_request(ZenithRequest *msg) { - StringInfoData s; + StringInfoData s; initStringInfo(&s); pq_sendbyte(&s, msg->tag); switch (messageTag(msg)) { - /* pagestore_client -> pagestore */ + /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); - break; - } + break; + } case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - pq_sendbyte(&s, 
msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); - break; - } + break; + } case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); - pq_sendint32(&s, msg_req->rnode.spcNode); - pq_sendint32(&s, msg_req->rnode.dbNode); - pq_sendint32(&s, msg_req->rnode.relNode); - pq_sendbyte(&s, msg_req->forknum); - pq_sendint32(&s, msg_req->blkno); + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->rnode.spcNode); + pq_sendint32(&s, msg_req->rnode.dbNode); + pq_sendint32(&s, msg_req->rnode.relNode); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); - break; - } + break; + } - /* pagestore -> pagestore_client. We never need to create these. */ + /* pagestore -> pagestore_client. We never need to create these. 
*/ case T_ZenithExistsResponse: case T_ZenithNblocksResponse: case T_ZenithGetPageResponse: @@ -131,66 +131,67 @@ zm_unpack_response(StringInfo s) switch (tag) { - /* pagestore -> pagestore_client */ + /* pagestore -> pagestore_client */ case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); + { + ZenithExistsResponse *msg_resp = palloc0(sizeof(ZenithExistsResponse)); - msg_resp->tag = tag; - msg_resp->exists = pq_getmsgbyte(s); - pq_getmsgend(s); + msg_resp->tag = tag; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); + { + ZenithNblocksResponse *msg_resp = palloc0(sizeof(ZenithNblocksResponse)); - msg_resp->tag = tag; - msg_resp->n_blocks = pq_getmsgint(s, 4); - pq_getmsgend(s); + msg_resp->tag = tag; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } case T_ZenithGetPageResponse: - { - ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); + { + ZenithGetPageResponse *msg_resp = palloc0(offsetof(ZenithGetPageResponse, page) + BLCKSZ); - msg_resp->tag = tag; - memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); // XXX: should be varlena - pq_getmsgend(s); + msg_resp->tag = tag; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } case T_ZenithErrorResponse: - { - ZenithErrorResponse *msg_resp; - size_t msglen; - const char *msgtext; + { + ZenithErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; - msgtext = pq_getmsgrawstring(s); - msglen = strlen(msgtext); + msgtext = 
pq_getmsgrawstring(s); + msglen = strlen(msgtext); - msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); - msg_resp->tag = tag; - memcpy(msg_resp->message, msgtext, msglen + 1); - pq_getmsgend(s); + msg_resp = palloc0(sizeof(ZenithErrorResponse) + msglen + 1); + msg_resp->tag = tag; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); - resp = (ZenithResponse *) msg_resp; - break; - } + resp = (ZenithResponse *) msg_resp; + break; + } - /* - * pagestore_client -> pagestore - * - * We create these ourselves, and don't need to decode them. - */ + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. + */ case T_ZenithExistsRequest: case T_ZenithNblocksRequest: case T_ZenithGetPageRequest: @@ -206,108 +207,108 @@ zm_unpack_response(StringInfo s) char * zm_to_string(ZenithMessage *msg) { - StringInfoData s; + StringInfoData s; initStringInfo(&s); switch (messageTag(msg)) { - /* pagestore_client -> pagestore */ + /* pagestore_client -> pagestore */ case T_ZenithExistsRequest: - { - ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } + { + ZenithExistsRequest *msg_req = (ZenithExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithExistsRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + 
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } case T_ZenithNblocksRequest: - { - ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } + { + ZenithNblocksRequest *msg_req = (ZenithNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } case T_ZenithGetPageRequest: - { - ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); - appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", - msg_req->rnode.spcNode, - msg_req->rnode.dbNode, - msg_req->rnode.relNode); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); - appendStringInfoChar(&s, '}'); - break; - } + { + ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageRequest\""); + appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"", + 
msg_req->rnode.spcNode, + msg_req->rnode.dbNode, + msg_req->rnode.relNode); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } - /* pagestore -> pagestore_client */ + /* pagestore -> pagestore_client */ case T_ZenithExistsResponse: - { - ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; + { + ZenithExistsResponse *msg_resp = (ZenithExistsResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); - appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists - ); - appendStringInfoChar(&s, '}'); + appendStringInfoString(&s, "{\"type\": \"ZenithExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists + ); + appendStringInfoChar(&s, '}'); - break; - } + break; + } case T_ZenithNblocksResponse: - { - ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; + { + ZenithNblocksResponse *msg_resp = (ZenithNblocksResponse *) msg; - appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks - ); - appendStringInfoChar(&s, '}'); + appendStringInfoString(&s, "{\"type\": \"ZenithNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks + ); + appendStringInfoChar(&s, '}'); - break; - } + break; + } case T_ZenithGetPageResponse: - { + { #if 0 - ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; + ZenithGetPageResponse *msg_resp = (ZenithGetPageResponse *) msg; #endif - appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); - appendStringInfo(&s, ", \"page\": \"XXX\"}"); - appendStringInfoChar(&s, '}'); - break; - } + appendStringInfoString(&s, "{\"type\": \"ZenithGetPageResponse\""); + appendStringInfo(&s, ", 
\"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } case T_ZenithErrorResponse: - { - ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; + { + ZenithErrorResponse *msg_resp = (ZenithErrorResponse *) msg; - /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); - appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); - appendStringInfoChar(&s, '}'); - break; - } + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"ZenithErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } default: appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); @@ -328,7 +329,7 @@ log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, PGAlignedBlock copied_buffer; /* set the flag in the original page, like log_newpage() does. */ - ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; + ((PageHeader) page)->pd_flags |= PD_WAL_LOGGED; memcpy(copied_buffer.data, page, BLCKSZ); return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); @@ -338,19 +339,20 @@ log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, static void zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { - XLogRecPtr lsn = PageGetLSN(buffer); + XLogRecPtr lsn = PageGetLSN(buffer); if (ShutdownRequestPending) return; /* - * If the page was not WAL-logged before eviction then we can lose its modification. - * PD_WAL_LOGGED bit is used to mark pages which are wal-logged. + * If the page was not WAL-logged before eviction then we can lose its + * modification. PD_WAL_LOGGED bit is used to mark pages which are + * wal-logged. * * See also comments to PD_WAL_LOGGED. * - * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the whole index . - * That's duplicative with the WAL-logging that we do here. 
+ * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the + * whole index. That's duplicative with the WAL-logging that we do here. * See log_newpage_range() calls. * * FIXME: Redoing this record will set the LSN on the page. That could @@ -359,7 +361,8 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (forknum == FSM_FORKNUM && !RecoveryInProgress()) { /* FSM is never WAL-logged and we don't care. */ - XLogRecPtr recptr; + XLogRecPtr recptr; + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; @@ -368,18 +371,19 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); + forknum, (uint32) lsn); } else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) { /* - * Always WAL-log vm. - * We should never miss clearing visibility map bits. + * Always WAL-log vm. We should never miss clearing visibility map + * bits. * - * TODO Is it too bad for performance? - * Hopefully we do not evict actively used vm too often. + * TODO Is it too bad for performance? Hopefully we do not evict + * actively used vm too often. */ - XLogRecPtr recptr; + XLogRecPtr recptr; + recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; @@ -389,45 +393,48 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); + forknum, (uint32) lsn); } - else if (!(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED) - && !RecoveryInProgress()) + else if (!(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED) + && !RecoveryInProgress()) { - XLogRecPtr recptr; + XLogRecPtr recptr; + /* * We assume standard page layout here. 
* * But at smgr level we don't really know what kind of a page this is. - * We have filtered visibility map pages and fsm pages above. - * TODO Do we have any special page types? + * We have filtered visibility map pages and fsm pages above. TODO Do + * we have any special page types? */ recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); - /* If we wal-log hint bits, someone could concurrently update page - * and reset PD_WAL_LOGGED again, so this assert is not relevant anymore. + /* + * If we wal-log hint bits, someone could concurrently update page and + * reset PD_WAL_LOGGED again, so this assert is not relevant anymore. * - * See comment to FlushBuffer(). - * The caller must hold a pin on the buffer and have share-locked the - * buffer contents. (Note: a share-lock does not prevent updates of - * hint bits in the buffer, so the page could change while the write - * is in progress, but we assume that that will not invalidate the data - * written.) + * See comment to FlushBuffer(). The caller must hold a pin on the + * buffer and have share-locked the buffer contents. (Note: a + * share-lock does not prevent updates of hint bits in the buffer, so + * the page could change while the write is in progress, but we assume + * that that will not invalidate the data written.) */ - Assert(((PageHeader)buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by log_newpage */ + Assert(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by + * log_newpage */ /* - * Need to flush it too, so that it gets sent to the Page Server before we - * might need to read it back. It should get flushed eventually anyway, at - * least if there is some other WAL activity, so this isn't strictly - * necessary for correctness. But if there is no other WAL activity, the - * page read might get stuck waiting for the record to be streamed out - * for an indefinite time. 
+ * Need to flush it too, so that it gets sent to the Page Server + * before we might need to read it back. It should get flushed + * eventually anyway, at least if there is some other WAL activity, so + * this isn't strictly necessary for correctness. But if there is no + * other WAL activity, the page read might get stuck waiting for the + * record to be streamed out for an indefinite time. * - * FIXME: Flushing the WAL is expensive. We should track the last "evicted" - * LSN instead, and update it here. Or just kick the bgwriter to do the - * flush, there is no need for us to block here waiting for it to finish. + * FIXME: Flushing the WAL is expensive. We should track the last + * "evicted" LSN instead, and update it here. Or just kick the + * bgwriter to do the flush, there is no need for us to block here + * waiting for it to finish. */ XLogFlush(recptr); lsn = recptr; @@ -436,14 +443,16 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); - } else { + forknum, (uint32) lsn); + } + else + { elog(SmgrTrace, "Page %u of relation %u/%u/%u.%u is alread wal logged at lsn=%X", blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, - forknum, (uint32)lsn); + forknum, (uint32) lsn); } SetLastWrittenPageLSN(lsn); } @@ -472,14 +481,15 @@ zenith_init(void) static XLogRecPtr zm_adjust_lsn(XLogRecPtr lsn) { - /* If lsn points to the beging of first record on page or segment, - * then "return" it back to the page origin + /* + * If lsn points to the beging of first record on page or segment, then + * "return" it back to the page origin */ - if ((lsn & (XLOG_BLCKSZ-1)) == SizeOfXLogShortPHD) + if ((lsn & (XLOG_BLCKSZ - 1)) == SizeOfXLogShortPHD) { lsn -= SizeOfXLogShortPHD; } - else if ((lsn & (wal_segment_size-1)) == SizeOfXLogLongPHD) + else if ((lsn & (wal_segment_size - 1)) == 
SizeOfXLogLongPHD) { lsn -= SizeOfXLogLongPHD; } @@ -492,13 +502,13 @@ zm_adjust_lsn(XLogRecPtr lsn) static XLogRecPtr zenith_get_request_lsn(bool *latest) { - XLogRecPtr lsn; + XLogRecPtr lsn; if (RecoveryInProgress()) { lsn = GetXLogReplayRecPtr(NULL); elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = InvalidXLogRecPtr; } else if (am_walsender) @@ -508,7 +518,7 @@ zenith_get_request_lsn(bool *latest) } else { - XLogRecPtr flushlsn; + XLogRecPtr flushlsn; /* * Use the latest LSN that was evicted from the buffer cache. Any @@ -518,14 +528,15 @@ zenith_get_request_lsn(bool *latest) lsn = GetLastWrittenPageLSN(); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); /* - * Is it possible that the last-written LSN is ahead of last flush LSN? Probably not, - * we shouldn't evict a page from the buffer cache before all its modifications have - * been safely flushed. That's the "WAL before data" rule. But better safe than sorry. + * Is it possible that the last-written LSN is ahead of last flush + * LSN? Probably not, we shouldn't evict a page from the buffer cache + * before all its modifications have been safely flushed. That's the + * "WAL before data" rule. But better safe than sorry. */ flushlsn = GetFlushRecPtr(); if (lsn > flushlsn) @@ -538,8 +549,8 @@ zenith_get_request_lsn(bool *latest) } /* - * FIXME: In read-only mode, we would need to set *latest=false here. But we don't - * support read-only mode at the moment + * FIXME: In read-only mode, we would need to set *latest=false here. 
But + * we don't support read-only mode at the moment */ *latest = true; return lsn; @@ -566,10 +577,12 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) .rnode = reln->smgr_rnode.node, .forknum = forkNum }; + resp = page_server->request((ZenithRequest *) &request); } - switch (resp->tag) { + switch (resp->tag) + { case T_ZenithExistsResponse: exists = ((ZenithExistsResponse *) resp)->exists; break; @@ -583,7 +596,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) reln->smgr_rnode.node.relNode, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", + errdetail("page server returned error: %s", ((ZenithErrorResponse *) resp)->message))); break; @@ -653,10 +666,10 @@ void zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { - XLogRecPtr lsn; + XLogRecPtr lsn; zenith_wallog_page(reln, forkNum, blkno, buffer); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno+1); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", @@ -721,7 +734,7 @@ zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) */ void zenith_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) + BlockNumber blocknum, BlockNumber nblocks) { /* not implemented */ elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); @@ -741,7 +754,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { ZenithResponse *resp; bool latest; - XLogRecPtr request_lsn; + XLogRecPtr request_lsn; request_lsn = zenith_get_request_lsn(&latest); { @@ -753,10 +766,12 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, .forknum = forkNum, .blkno = blkno }; + resp = page_server->request((ZenithRequest *) &request); } - switch (resp->tag) { + switch (resp->tag) + { case T_ZenithGetPageResponse: 
memcpy(buffer, ((ZenithGetPageResponse *) resp)->page, BLCKSZ); break; @@ -771,7 +786,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, reln->smgr_rnode.node.relNode, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", + errdetail("page server returned error: %s", ((ZenithErrorResponse *) resp)->message))); break; @@ -782,22 +797,24 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, pfree(resp); /* Clear PD_WAL_LOGGED bit stored in WAL record */ - ((PageHeader)buffer)->pd_flags &= ~PD_WAL_LOGGED; + ((PageHeader) buffer)->pd_flags &= ~PD_WAL_LOGGED; #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { - char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + char pageserver_masked[BLCKSZ]; + char mdbuf[BLCKSZ]; + char mdbuf_masked[BLCKSZ]; mdread(reln, forkNum, blkno, mdbuf); memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked, mdbuf, BLCKSZ); - if (PageIsNew(mdbuf)) { - if (!PageIsNew(pageserver_masked)) { + if (PageIsNew(mdbuf)) + { + if (!PageIsNew(pageserver_masked)) + { elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, reln->smgr_rnode.node.spcNode, @@ -808,23 +825,25 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, hexdump_page(buffer)); } } - else if (PageIsNew(buffer)) { + else if (PageIsNew(buffer)) + { elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + blkno, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forkNum, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + 
hexdump_page(mdbuf)); } else if (PageGetSpecialSize(mdbuf) == 0) { - // assume heap + /* assume heap */ RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, reln->smgr_rnode.node.spcNode, @@ -840,11 +859,12 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) { - // assume btree + /* assume btree */ RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { + if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + { elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, reln->smgr_rnode.node.spcNode, @@ -875,7 +895,7 @@ hexdump_page(char *page) appendStringInfo(&result, " "); if (i % 40 == 0) appendStringInfo(&result, "\n"); - appendStringInfo(&result, "%02x", (unsigned char)(page[i])); + appendStringInfo(&result, "%02x", (unsigned char) (page[i])); } return result.data; @@ -893,7 +913,7 @@ void zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - XLogRecPtr lsn; + XLogRecPtr lsn; zenith_wallog_page(reln, forknum, blocknum, buffer); @@ -938,7 +958,8 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) resp = page_server->request((ZenithRequest *) &request); } - switch (resp->tag) { + switch (resp->tag) + { case T_ZenithNblocksResponse: n_blocks = ((ZenithNblocksResponse *) resp)->n_blocks; break; @@ -952,7 +973,7 @@ zenith_nblocks(SMgrRelation reln, 
ForkNumber forknum) reln->smgr_rnode.node.relNode, forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn), - errdetail("page server returned error: %s", + errdetail("page server returned error: %s", ((ZenithErrorResponse *) resp)->message))); break; @@ -979,17 +1000,17 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { - XLogRecPtr lsn; + XLogRecPtr lsn; set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); /* - * Truncating a relation drops all its buffers from the buffer cache without - * calling smgrwrite() on them. But we must account for that in our tracking - * of last-written-LSN all the same: any future smgrnblocks() request must - * return the new size after the truncation. We don't know what the LSN of - * the truncation record was, so be conservative and use the most recently - * inserted WAL record's LSN. + * Truncating a relation drops all its buffers from the buffer cache + * without calling smgrwrite() on them. But we must account for that in + * our tracking of last-written-LSN all the same: any future smgrnblocks() + * request must return the new size after the truncation. We don't know + * what the LSN of the truncation record was, so be conservative and use + * the most recently inserted WAL record's LSN. */ lsn = GetXLogInsertRecPtr(); diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c index 5cb86e116a7..eb5b3f45a34 100644 --- a/contrib/zenith/relsize_cache.c +++ b/contrib/zenith/relsize_cache.c @@ -33,15 +33,21 @@ typedef struct typedef struct { - RelTag tag; + RelTag tag; BlockNumber size; } RelSizeEntry; static HTAB *relsize_hash; static LWLockId relsize_lock; -static int relsize_hash_size; +static int relsize_hash_size; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +/* + * Size of cache entry is 20 bytes. So 64 entry will take about 1.2 Mb, + * which seems to be a reasonable default. 
+ */ +#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) + static void zenith_smgr_shmem_startup(void) { @@ -51,7 +57,7 @@ zenith_smgr_shmem_startup(void) prev_shmem_startup_hook(); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - relsize_lock = (LWLockId)GetNamedLWLockTranche("zenith_relsize"); + relsize_lock = (LWLockId) GetNamedLWLockTranche("zenith_relsize"); info.keysize = sizeof(RelTag); info.entrysize = sizeof(RelSizeEntry); relsize_hash = ShmemInitHash("zenith_relsize", @@ -62,13 +68,14 @@ zenith_smgr_shmem_startup(void) } bool -get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size) +get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size) { - bool found = false; + bool found = false; + if (relsize_hash_size > 0) { - RelTag tag; - RelSizeEntry* entry; + RelTag tag; + RelSizeEntry *entry; tag.rnode = rnode; tag.forknum = forknum; @@ -89,8 +96,8 @@ set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { - RelTag tag; - RelSizeEntry* entry; + RelTag tag; + RelSizeEntry *entry; tag.rnode = rnode; tag.forknum = forknum; @@ -106,9 +113,9 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) { if (relsize_hash_size > 0) { - RelTag tag; - RelSizeEntry* entry; - bool found; + RelTag tag; + RelSizeEntry *entry; + bool found; tag.rnode = rnode; tag.forknum = forknum; @@ -127,17 +134,12 @@ relsize_hash_init(void) "Sets the maximum number of cached relation sizes for zenith", NULL, &relsize_hash_size, - /* - * Size of cache entry is 20 bytes. - * So 64 entry will take about 1.2 Mb, - * which seems to be a reasonable default. 
- */ - 64*1024, + DEFAULT_RELSIZE_HASH_SIZE, 0, INT_MAX, PGC_POSTMASTER, 0, - NULL, NULL, NULL); + NULL, NULL, NULL); if (relsize_hash_size > 0) { @@ -147,4 +149,4 @@ relsize_hash_init(void) prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = zenith_smgr_shmem_startup; } -} \ No newline at end of file +} diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5eab36461f8..b2448102aa7 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -55,63 +55,65 @@ #include "utils/timestamp.h" -char* wal_acceptors_list; -int wal_acceptor_reconnect_timeout; -bool am_wal_proposer; +char *wal_acceptors_list; +int wal_acceptor_reconnect_timeout; +bool am_wal_proposer; /* Declared in walproposer.h, defined here, initialized in libpqwalproposer.c */ -WalProposerFunctionsType* WalProposerFunctions = NULL; +WalProposerFunctionsType *WalProposerFunctions = NULL; #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" -static int n_walkeepers = 0; -static int quorum = 0; -static WalKeeper walkeeper[MAX_WALKEEPERS]; -static WalMessage* msgQueueHead; -static WalMessage* msgQueueTail; -static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to this point */ -static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to walkeepers */ -static ProposerGreeting proposerGreeting; -static WaitEventSet* waitEvents; +static int n_walkeepers = 0; +static int quorum = 0; +static WalKeeper walkeeper[MAX_WALKEEPERS]; +static WalMessage *msgQueueHead; +static WalMessage *msgQueueTail; +static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to + * this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to + * walkeepers */ +static ProposerGreeting proposerGreeting; +static WaitEventSet *waitEvents; static AppendResponse lastFeedback; /* * minimal LSN which may be needed for recovery of some safekeeper (end lsn * + 1 of last chunk streamed to 
everyone) */ -static XLogRecPtr truncateLsn; -static XLogRecPtr candidateTruncateLsn; +static XLogRecPtr truncateLsn; +static XLogRecPtr candidateTruncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ -static term_t propTerm; /* term of the proposer */ -static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ -static term_t donorEpoch; /* Most advanced acceptor epoch */ -static int donor; /* Most advanced acceptor */ -static int n_votes = 0; -static int n_connected = 0; -static TimestampTz last_reconnect_attempt; +static term_t propTerm; /* term of the proposer */ +static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ +static term_t donorEpoch; /* Most advanced acceptor epoch */ +static int donor; /* Most advanced acceptor */ +static int n_votes = 0; +static int n_connected = 0; +static TimestampTz last_reconnect_attempt; /* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ -static bool syncSafekeepers; +static bool syncSafekeepers; /* Declarations of a few functions ahead of time, so that we can define them out of order. 
*/ static void AdvancePollState(int i, uint32 events); -static bool AsyncRead(int i, void* value, size_t value_size); -static bool BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state); -static bool AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); +static bool AsyncRead(int i, void *value, size_t value_size); +static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); +static bool AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); static void HackyRemoveWalProposerEvent(int to_remove); -static WalMessage* CreateMessageCommitLsnOnly(XLogRecPtr lsn); -static void BroadcastMessage(WalMessage* msg); +static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void BroadcastMessage(WalMessage *msg); /* * Combine hot standby feedbacks from all walkeepers. */ static void -CombineHotStanbyFeedbacks(HotStandbyFeedback* hs) +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) { hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ for (int i = 0; i < n_walkeepers; i++) { @@ -154,7 +156,7 @@ InitEventSet(void) * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. 
*/ static void -UpdateEventSet(WalKeeper* wk, uint32 events) +UpdateEventSet(WalKeeper *wk, uint32 events) { /* eventPos = -1 when we don't have an event */ Assert(wk->eventPos != -1); @@ -170,19 +172,23 @@ static void HackyRemoveWalProposerEvent(int to_remove) { /* Remove the existing event set */ - if (waitEvents) { + if (waitEvents) + { FreeWaitEventSet(waitEvents); waitEvents = NULL; } /* Re-initialize it without adding any walkeeper events */ InitEventSet(); - /* loop through the existing walkeepers. If they aren't the one we're removing, and if they have - * a socket we can use, re-add the applicable events. */ + /* + * loop through the existing walkeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ for (int i = 0; i < n_walkeepers; i++) { - uint32 desired_events = WL_NO_EVENTS; - WalKeeper* wk = &walkeeper[i]; + uint32 desired_events = WL_NO_EVENTS; + WalKeeper *wk = &walkeeper[i]; wk->eventPos = -1; @@ -219,58 +225,75 @@ ShutdownConnection(int i) static void ResetConnection(int i) { - pgsocket sock; /* socket of the new connection */ - WalKeeper *wk = &walkeeper[i]; + pgsocket sock; /* socket of the new connection */ + WalKeeper *wk = &walkeeper[i]; if (wk->state != SS_OFFLINE) { ShutdownConnection(i); } - /* Try to establish new connection + /* + * Try to establish new connection * * If the connection information hasn't been filled out, we need to do - * that here. */ + * that here. 
+ */ if (wk->conninfo[0] == '\0') { - sprintf((char*) &wk->conninfo, + sprintf((char *) &wk->conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", wk->host, wk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); } - wk->conn = walprop_connect_start((char*) &wk->conninfo); + wk->conn = walprop_connect_start((char *) &wk->conninfo); - /* "If the result is null, then libpq has been unable to allocate a new PGconn structure" */ + /* + * "If the result is null, then libpq has been unable to allocate a new + * PGconn structure" + */ if (!wk->conn) elog(FATAL, "failed to allocate new PGconn object"); - /* PQconnectStart won't actually start connecting until we run PQconnectPoll. Before we do that - * though, we need to check that it didn't immediately fail. */ + /* + * PQconnectStart won't actually start connecting until we run + * PQconnectPoll. Before we do that though, we need to check that it + * didn't immediately fail. + */ if (walprop_status(wk->conn) == WP_CONNECTION_BAD) { - /* According to libpq docs: - * "If the result is CONNECTION_BAD, the connection attempt has already failed, typically - * because of invalid connection parameters." + /*--- + * According to libpq docs: + * "If the result is CONNECTION_BAD, the connection attempt has already failed, + * typically because of invalid connection parameters." * We should report this failure. 
* - * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ + * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS + */ elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", wk->conninfo, walprop_error_message(wk->conn)); - /* Even though the connection failed, we still need to clean up the object */ + + /* + * Even though the connection failed, we still need to clean up the + * object + */ walprop_finish(wk->conn); wk->conn = NULL; return; } - /* The documentation for PQconnectStart states that we should call PQconnectPoll in a loop until - * it returns PGRES_POLLING_OK or PGRES_POLLING_FAILED. The other two possible returns indicate - * whether we should wait for reading or writing on the socket. For the first iteration of the - * loop, we're expected to wait until the socket becomes writable. + /* + * The documentation for PQconnectStart states that we should call + * PQconnectPoll in a loop until it returns PGRES_POLLING_OK or + * PGRES_POLLING_FAILED. The other two possible returns indicate whether + * we should wait for reading or writing on the socket. For the first + * iteration of the loop, we're expected to wait until the socket becomes + * writable. * - * The wording of the documentation is a little ambiguous; thankfully there's an example in the - * postgres source itself showing this behavior. - * (see libpqrcv_connect, defined in - * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) + * The wording of the documentation is a little ambiguous; thankfully + * there's an example in the postgres source itself showing this behavior. 
+ * (see libpqrcv_connect, defined in + * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); @@ -287,17 +310,18 @@ ResetConnection(int i) static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void) { - XLogRecPtr responses[MAX_WALKEEPERS]; + XLogRecPtr responses[MAX_WALKEEPERS]; + /* * Sort acknowledged LSNs */ for (int i = 0; i < n_walkeepers; i++) { /* - * Note that while we haven't pushed WAL up to epoch start lsn to the majority we - * don't really know which LSN is reliably committed as reported - * flush_lsn is physical end of wal, which can contain diverged - * history (compared to donor). + * Note that while we haven't pushed WAL up to epoch start lsn to the + * majority we don't really know which LSN is reliably committed as + * reported flush_lsn is physical end of wal, which can contain + * diverged history (compared to donor). */ responses[i] = walkeeper[i].feedback.epoch == propTerm ? walkeeper[i].feedback.flushLsn : 0; @@ -314,7 +338,7 @@ static void HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; - XLogRecPtr minQuorumLsn; + XLogRecPtr minQuorumLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) @@ -340,21 +364,24 @@ HandleWalKeeperResponse(void) /* Cleanup message queue */ while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1)) { - WalMessage* msg = msgQueueHead; + WalMessage *msg = msgQueueHead; + msgQueueHead = msg->next; + /* * This piece is received by everyone; try to advance truncateLsn, but * hold it back to nearest commitLsn. Thus we will always start - * streaming from the beginning of the record, which simplifies decoding - * on the far end. + * streaming from the beginning of the record, which simplifies + * decoding on the far end. 
* * This also prevents surprising violation of truncateLsn <= commitLsn * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and commitLsn - * generally can't be advanced based on feedback from safekeeper who is - * still in the previous epoch (similar to 'leader can't commit entries - * from previous term' in Raft); 2) chunks we read from WAL and send are - * plain sheets of bytes, but safekeepers ack only on commit boundaries. + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on commit boundaries. */ if (msg->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) { @@ -375,20 +402,20 @@ HandleWalKeeperResponse(void) memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } - if (!msgQueueHead) /* queue is empty */ + if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; /* * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are ready - * to give all WAL to pageserver. It would mean whichever majority is alive, - * there will be at least one safekeeper who is able to stream WAL to - * pageserver to make basebackup possible. However, since at the moment we - * don't have any good mechanism of defining the healthy and most advanced - * safekeeper who should push the wal into pageserver and basically the - * random one gets connected, to prevent hanging basebackup (due to - * pageserver connecting to not-synced-walkeeper) we currently wait for all - * seemingly alive walkeepers to get synced. 
+ * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-walkeeper) we currently + * wait for all seemingly alive walkeepers to get synced. */ if (syncSafekeepers) { @@ -397,8 +424,8 @@ HandleWalKeeperResponse(void) n_synced = 0; for (int i = 0; i < n_walkeepers; i++) { - WalKeeper *wk = &walkeeper[i]; - bool synced = wk->feedback.commitLsn >= propEpochStartLsn; + WalKeeper *wk = &walkeeper[i]; + bool synced = wk->feedback.commitLsn >= propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ if (wk->state != SS_OFFLINE && !synced) @@ -415,16 +442,16 @@ HandleWalKeeperResponse(void) } } -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; +char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) { - char* host; - char* sep; - char* port; + char *host; + char *sep; + char *port; /* Load the libpq-specific functions */ load_file("libpqwalproposer", false); @@ -439,14 +466,15 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) { port = strchr(host, ':'); - if (port == NULL) { + if (port == NULL) + { elog(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); if (sep != NULL) *sep++ = '\0'; - if (n_walkeepers+1 >= MAX_WALKEEPERS) + if (n_walkeepers + 1 >= MAX_WALKEEPERS) { elog(FATAL, "Too many walkeepers"); } @@ -454,7 +482,11 
@@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) walkeeper[n_walkeepers].port = port; walkeeper[n_walkeepers].state = SS_OFFLINE; walkeeper[n_walkeepers].conn = NULL; - /* Set conninfo to empty. We'll fill it out once later, in `ResetConnection` as needed */ + + /* + * Set conninfo to empty. We'll fill it out once later, in + * `ResetConnection` as needed + */ walkeeper[n_walkeepers].conninfo[0] = '\0'; walkeeper[n_walkeepers].currMsg = NULL; n_walkeepers += 1; @@ -463,7 +495,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) { elog(FATAL, "WalKeepers addresses are not specified"); } - quorum = n_walkeepers/2 + 1; + quorum = n_walkeepers / 2 + 1; /* Fill the greeting package */ proposerGreeting.tag = 'g'; @@ -474,12 +506,12 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) + !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); proposerGreeting.timeline = ThisTimeLineID; proposerGreeting.walSegSize = wal_segment_size; @@ -526,7 +558,8 @@ WalProposerMain(Datum main_arg) last_reconnect_attempt = GetCurrentTimestamp(); - application_name = (char *) "walproposer"; /* for synchronous_standby_names */ + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ am_wal_proposer = true; am_walsender = true; InitWalSender(); @@ 
-596,6 +629,7 @@ static void WalProposerStartStreaming(XLogRecPtr startpos) { StartReplicationCmd cmd; + elog(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; @@ -611,15 +645,16 @@ WalProposerStartStreaming(XLogRecPtr startpos) * these before calling would be redundant work. */ static void -SendMessageToNode(int i, WalMessage* msg) +SendMessageToNode(int i, WalMessage *msg) { - WalKeeper* wk = &walkeeper[i]; + WalKeeper *wk = &walkeeper[i]; /* we shouldn't be already sending something */ Assert(wk->currMsg == NULL); + /* - * Skip already acknowledged messages. Used after reconnection to get to the - * first not yet sent message. Otherwise we always just send 'msg'. + * Skip already acknowledged messages. Used after reconnection to get to + * the first not yet sent message. Otherwise we always just send 'msg'. */ while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; @@ -632,7 +667,10 @@ SendMessageToNode(int i, WalMessage* msg) wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); wk->currMsg->req.truncateLsn = truncateLsn; - /* Once we've selected and set up our message, actually start sending it. */ + /* + * Once we've selected and set up our message, actually start sending + * it. 
+ */ wk->state = SS_SEND_WAL; /* Don't ned to update the event set; that's done by AdvancePollState */ @@ -649,7 +687,7 @@ SendMessageToNode(int i, WalMessage* msg) * Broadcast new message to all caught-up walkeepers */ static void -BroadcastMessage(WalMessage* msg) +BroadcastMessage(WalMessage *msg) { for (int i = 0; i < n_walkeepers; i++) { @@ -660,12 +698,13 @@ BroadcastMessage(WalMessage* msg) } } -static WalMessage* -CreateMessage(XLogRecPtr startpos, char* data, int len) +static WalMessage * +CreateMessage(XLogRecPtr startpos, char *data, int len) { /* Create new message and append it to message queue */ - WalMessage* msg; - XLogRecPtr endpos; + WalMessage *msg; + XLogRecPtr endpos; + len -= XLOG_HDR_SIZE; endpos = startpos + len; if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) @@ -674,7 +713,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) return NULL; } Assert(len >= 0); - msg = (WalMessage*)malloc(sizeof(WalMessage) + len); + msg = (WalMessage *) malloc(sizeof(WalMessage) + len); if (msgQueueTail != NULL) msgQueueTail->next = msg; else @@ -691,7 +730,7 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) msg->req.beginLsn = startpos; msg->req.endLsn = endpos; msg->req.proposerId = proposerGreeting.proposerId; - memcpy(&msg->req+1, data + XLOG_HDR_SIZE, len); + memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); Assert(msg->req.endLsn >= lastSentLsn); lastSentLsn = msg->req.endLsn; @@ -699,9 +738,10 @@ CreateMessage(XLogRecPtr startpos, char* data, int len) } void -WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) +WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) { - WalMessage* msg = CreateMessage(startpos, data, len); + WalMessage *msg = CreateMessage(startpos, data, len); + if (msg != NULL) BroadcastMessage(msg); } @@ -710,13 +750,13 @@ WalProposerBroadcast(XLogRecPtr startpos, char* data, int len) * Create WAL message with no data, just to let the walkeepers * know that commit lsn has advanced. 
*/ -static WalMessage* +static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn) { /* Create new message and append it to message queue */ - WalMessage* msg; + WalMessage *msg; - msg = (WalMessage*)malloc(sizeof(WalMessage)); + msg = (WalMessage *) malloc(sizeof(WalMessage)); if (msgQueueTail != NULL) msgQueueTail->next = msg; else @@ -730,20 +770,24 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; + /* - * This serves two purposes: - * 1) After all msgs from previous epochs are pushed we queue empty - * WalMessage with lsn set to epochStartLsn which commands to switch the - * epoch, which allows to do the switch without creating new epoch - * records (we especially want to avoid such in --sync mode). - * Walproposer can advance commit_lsn only after the switch, so this lsn - * (reported back) also is the first possible advancement point. + * This serves two purposes: 1) After all msgs from previous epochs are + * pushed we queue empty WalMessage with lsn set to epochStartLsn which + * commands to switch the epoch, which allows to do the switch without + * creating new epoch records (we especially want to avoid such in --sync + * mode). Walproposer can advance commit_lsn only after the switch, so + * this lsn (reported back) also is the first possible advancement point. * 2) Maintain common invariant of queue entries sorted by LSN. */ msg->req.beginLsn = lsn; msg->req.endLsn = lsn; msg->req.proposerId = proposerGreeting.proposerId; - /* truncateLsn and commitLsn are set just before the message sent, in SendMessageToNode() */ + + /* + * truncateLsn and commitLsn are set just before the message sent, in + * SendMessageToNode() + */ return msg; } @@ -779,10 +823,10 @@ DetermineEpochStartLsn(void) } /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was - * committed yet. 
To keep the idea of always starting streaming since record - * boundary (which simplifies decoding on safekeeper), take start position - * of the slot. + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. To keep the idea of always starting streaming since + * record boundary (which simplifies decoding on safekeeper), take start + * position of the slot. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { @@ -791,10 +835,11 @@ DetermineEpochStartLsn(void) ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); } + /* - * If propEpochStartLsn is not 0, at least one msg with WAL was sent to some - * connected safekeeper; it must have carried truncateLsn pointing to the - * first record. + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. */ Assert((truncateLsn != InvalidXLogRecPtr) || (syncSafekeepers && truncateLsn == propEpochStartLsn)); @@ -834,6 +879,7 @@ static void ReconnectWalKeepers(void) { TimestampTz now = GetCurrentTimestamp(); + if (TimeToReconnect(now) == 0) { last_reconnect_attempt = now; @@ -851,8 +897,8 @@ ReconnectWalKeepers(void) static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) { - char conninfo[MAXCONNINFO]; - char *err; + char conninfo[MAXCONNINFO]; + char *err; WalReceiverConn *wrconn; WalRcvStreamOptions options; @@ -880,18 +926,19 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec if (walrcv_startstreaming(wrconn, &options)) { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; + while ((len = walrcv_receive(wrconn, &buf, 
&wait_fd)) >= 0) { if (len == 0) { (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); } else { @@ -915,15 +962,15 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec { ereport(LOG, (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32)(startpos >> 32), (uint32)startpos))); + timeline, (uint32) (startpos >> 32), (uint32) startpos))); return false; } /* * Start sending entries to everyone from the beginning (truncateLsn), - * except for those who lives in donor's epoch and thus for sure has correct - * WAL. We could do here even slightly better, taking into account commitLsn - * of the rest to avoid sending them excessive data. + * except for those who lives in donor's epoch and thus for sure has + * correct WAL. We could do here even slightly better, taking into account + * commitLsn of the rest to avoid sending them excessive data. */ for (int i = 0; i < n_walkeepers; i++) { @@ -945,13 +992,13 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } else { - uint32 len; - uint32 size; + uint32 len; + uint32 size; /* - * By convention we always stream since the beginning of the - * record, and flushLsn points to it -- form the message - * starting there. + * By convention we always stream since the beginning of + * the record, and flushLsn points to it -- form the + * message starting there. 
*/ len = msg->req.endLsn - walkeeper[i].voteResponse.flushLsn; size = sizeof(AppendRequestHeader) + len; @@ -961,8 +1008,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec walkeeper[i].voteResponse.flushLsn; memcpy(&msg->perSafekeeper[i] + 1, (char *) (&msg->req + 1) + - walkeeper[i].voteResponse.flushLsn - - msg->req.beginLsn, + walkeeper[i].voteResponse.flushLsn - + msg->req.beginLsn, len); SendMessageToNode(i, msg); break; @@ -981,25 +1028,28 @@ WalProposerPoll(void) { while (true) { - WalKeeper* wk; - int rc; - int i; + WalKeeper *wk; + int rc; + int i; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - wk = (WalKeeper*) event.user_data; - i = (int)(wk - walkeeper); + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + wk = (WalKeeper *) event.user_data; + i = (int) (wk - walkeeper); /* * If the event contains something that one of our walkeeper states * was waiting for, we'll advance its state. 
*/ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE|WL_SOCKET_WRITEABLE))) + if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) AdvancePollState(i, event.events); - /* If the timeout expired, attempt to reconnect to any walkeepers that we dropped */ + /* + * If the timeout expired, attempt to reconnect to any walkeepers that + * we dropped + */ ReconnectWalKeepers(); /* @@ -1020,91 +1070,116 @@ WalProposerPoll(void) static void AdvancePollState(int i, uint32 events) { - WalKeeper* wk = &walkeeper[i]; + WalKeeper *wk = &walkeeper[i]; - /* Keep advancing the state while either: - * (a) the event is still unprocessed (usually because it's the first - * iteration of the loop), or - * (b) the state can execute, and does not need to wait for any socket - * events + /* + * Keep advancing the state while either: (a) the event is still + * unprocessed (usually because it's the first iteration of the loop), or + * (b) the state can execute, and does not need to wait for any socket + * events */ while (events || StateShouldImmediatelyExecute(wk->state)) { - /* Sanity check. We assume further down that the operations don't block - * because the socket is ready. */ + /* + * Sanity check. We assume further down that the operations don't + * block because the socket is ready. + */ AssertEventsOkForState(events, wk); /* Execute the code corresponding to the current state */ switch (wk->state) { - /* WAL keepers are only taken out of SS_OFFLINE by calls to - * ResetConnection */ + /* + * WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ case SS_OFFLINE: elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", wk->host, wk->port); - break; /* actually unreachable, but prevents -Wimplicit-fallthrough */ + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ - /* Both connecting states run the same logic. 
The only difference is - * the events they're expecting */ + /* + * Both connecting states run the same logic. The only + * difference is the events they're expecting + */ case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: - { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; - - switch (result) { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); - - /* Once we're fully connected, we can move to the next state */ - wk->state = SS_EXEC_STARTWALPUSH; - - /* Even though SS_EXEC_STARTWALPUSH doesn't wait on anything, - * we do need to replace the current event, so we have to - * just pick something. We'll eventually need the socket to - * be readable, so we go with that. */ - new_events = WL_SOCKET_READABLE; - break; + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - /* If we need to poll to finish connecting, continue doing that */ - case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; - case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - /* If connecting failed, we don't want to restart the connection because - * that might run us into a loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to ReconnectWalKeepers. 
*/ - ShutdownConnection(i); - return; - } + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); + + /* + * Once we're fully connected, we can move to the + * next state + */ + wk->state = SS_EXEC_STARTWALPUSH; + + /* + * Even though SS_EXEC_STARTWALPUSH doesn't wait + * on anything, we do need to replace the current + * event, so we have to just pick something. We'll + * eventually need the socket to be readable, so + * we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; + + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectWalKeepers. + */ + ShutdownConnection(i); + return; + } - /* Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on the new - * socket. */ - HackyRemoveWalProposerEvent(i); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - break; - } + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. + */ + HackyRemoveWalProposerEvent(i); + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); + break; + } - /* Send "START_WAL_PUSH" command to the walkeeper. 
After sending, - * wait for response with SS_WAIT_EXEC_RESULT */ + /* + * Send "START_WAL_PUSH" command to the walkeeper. After + * sending, wait for response with SS_WAIT_EXEC_RESULT + */ case SS_EXEC_STARTWALPUSH: if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); + wk->host, wk->port, walprop_error_message(wk->conn)); ResetConnection(i); return; } @@ -1116,59 +1191,88 @@ AdvancePollState(int i, uint32 events) case SS_WAIT_EXEC_RESULT: switch (walprop_get_query_result(wk->conn)) { - /* Successful result, move on to starting the handshake */ + /* + * Successful result, move on to starting the + * handshake + */ case WP_EXEC_SUCCESS_COPYBOTH: - /* Because this state is immediately executable, we'll - * start this on the next iteration of the loop */ + + /* + * Because this state is immediately executable, we'll + * start this on the next iteration of the loop + */ wk->state = SS_HANDSHAKE_SEND; break; - /* Needs repeated calls to finish. Wait until the socket is - * readable */ + /* + * Needs repeated calls to finish. 
Wait until the + * socket is readable + */ case WP_EXEC_NEEDS_INPUT: - /* SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set */ + + /* + * SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set + */ break; case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); + wk->host, wk->port, walprop_error_message(wk->conn)); ResetConnection(i); return; - /* Unexpected result -- funamdentally an error, but we want to produce a custom - * message, rather than a generic "something went wrong" */ + /* + * Unexpected result -- funamdentally an error, but we + * want to produce a custom message, rather than a + * generic "something went wrong" + */ case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", - wk->host, wk->port); + wk->host, wk->port); ResetConnection(i); return; } break; - /* Start handshake: first of all send information about the WAL - * keeper. After sending, we wait on SS_HANDSHAKE_RECV for a - * response to finish the handshake. */ + /* + * Start handshake: first of all send information about the + * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ case SS_HANDSHAKE_SEND: - /* On failure, logging & resetting the connection is handled. We - * just need to handle the control flow. */ + + /* + * On failure, logging & resetting the connection is handled. + * We just need to handle the control flow. 
+ */ if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) return; break; - /* Finish handshake comms: receive information about the WAL keeper */ + /* + * Finish handshake comms: receive information about the WAL + * keeper + */ case SS_HANDSHAKE_RECV: - /* If our reading doesn't immediately succeed, any necessary error handling or state - * setting is taken care of. We can leave any other work until later. */ + + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ if (!AsyncRead(i, &wk->greet, sizeof(wk->greet))) return; /* Protocol is all good, move to voting. */ wk->state = SS_VOTING; - /* Don't need to update the event set yet. Either we update the - * event set to WL_SOCKET_READABLE *or* we change the state to - * SS_SEND_VOTE in the loop below */ + + /* + * Don't need to update the event set yet. Either we update + * the event set to WL_SOCKET_READABLE *or* we change the + * state to SS_SEND_VOTE in the loop below + */ UpdateEventSet(wk, WL_SOCKET_READABLE); wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; @@ -1179,14 +1283,19 @@ AdvancePollState(int i, uint32 events) */ propTerm = Max(walkeeper[i].greet.term, propTerm); - /* Check if we have quorum. If there aren't enough walkeepers, wait and do nothing. - * We'll eventually get a task when the election starts. + /* + * Check if we have quorum. If there aren't enough walkeepers, + * wait and do nothing. We'll eventually get a task when the + * election starts. * - * If we do have quorum, we can start an election */ + * If we do have quorum, we can start an election + */ if (++n_connected < quorum) { - /* SS_VOTING is an idle state; read-ready indicates the - * connection closed. */ + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. 
+ */ UpdateEventSet(wk, WL_SOCKET_READABLE); } else @@ -1195,19 +1304,26 @@ AdvancePollState(int i, uint32 events) { propTerm++; /* prepare voting message */ - voteRequest = (VoteRequest) { + voteRequest = (VoteRequest) + { .tag = 'v', .term = propTerm }; memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); } - /* Now send voting request to the cohort and wait responses */ + /* + * Now send voting request to the cohort and wait + * responses + */ for (int j = 0; j < n_walkeepers; j++) { - /* Remember: SS_VOTING indicates that the walkeeper is participating in - * voting, but hasn't sent anything yet. The ones that have sent something - * are given SS_SEND_VOTE or SS_WAIT_VERDICT. */ + /* + * Remember: SS_VOTING indicates that the walkeeper is + * participating in voting, but hasn't sent anything + * yet. The ones that have sent something are given + * SS_SEND_VOTE or SS_WAIT_VERDICT. + */ if (walkeeper[j].state == SS_VOTING) { walkeeper[j].state = SS_SEND_VOTE; @@ -1218,16 +1334,18 @@ AdvancePollState(int i, uint32 events) } break; - /* Voting is an idle state - we don't expect any events to trigger. Refer to the - * execution of SS_HANDSHAKE_RECV to see how nodes are transferred from SS_VOTING to - * SS_SEND_VOTE. */ + /* + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see + * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. 
+ */ case SS_VOTING: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); ResetConnection(i); break; - /* We have quorum for voting, send our vote request */ + /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: /* On failure, logging & resetting is handled */ if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) @@ -1236,10 +1354,14 @@ AdvancePollState(int i, uint32 events) /* If successful, wait for read-ready with SS_WAIT_VERDICT */ break; - /* Start reading the walkeeper response for our candidate */ + /* Start reading the walkeeper response for our candidate */ case SS_WAIT_VERDICT: - /* If our reading doesn't immediately succeed, any necessary error handling or state - * setting is taken care of. We can leave any other work until later. */ + + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. 
+ */ if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) return; @@ -1259,8 +1381,8 @@ AdvancePollState(int i, uint32 events) (wk->voteResponse.term > propTerm || n_votes < quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); + wk->host, wk->port, + wk->voteResponse.term, propTerm); } Assert(wk->voteResponse.term == propTerm); @@ -1268,17 +1390,24 @@ AdvancePollState(int i, uint32 events) if (++n_votes != quorum) { - /* We are already streaming WAL: send all pending messages to the attached walkeeper */ + /* + * We are already streaming WAL: send all pending messages + * to the attached walkeeper + */ SendMessageToNode(i, msgQueueHead); } else { wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for read-ready */ + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ DetermineEpochStartLsn(); - /* Check if not all safekeepers are up-to-date, we need to download WAL needed to synchronize them */ + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ if (truncateLsn < propEpochStartLsn) { elog(LOG, @@ -1289,11 +1418,12 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); + /* - * This message signifies epoch switch; it is needed to - * make the switch happen on donor, as he won't get any - * other messages until we start writing new WAL (and we - * e.g. don't in --sync mode at all) + * This message signifies epoch switch; it is needed + * to make the switch happen on donor, as he won't get + * any other messages until we start writing new WAL + * (and we e.g. 
don't in --sync mode at all) */ BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); @@ -1315,101 +1445,124 @@ AdvancePollState(int i, uint32 events) break; - /* Idle state for sending WAL. Moved out only by calls to - * SendMessageToNode */ + /* + * Idle state for sending WAL. Moved out only by calls to + * SendMessageToNode + */ case SS_IDLE: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); ResetConnection(i); break; - /* Start to send the message at wk->currMsg. Triggered only by calls - * to SendMessageToNode */ + /* + * Start to send the message at wk->currMsg. Triggered only by + * calls to SendMessageToNode + */ case SS_SEND_WAL: - { - WalMessage* msg = wk->currMsg; - AppendRequestHeader *req = &msg->req; + { + WalMessage *msg = wk->currMsg; + AppendRequestHeader *req = &msg->req; - /* if there is a message specially crafted for this safekeeper, send it */ - if (msg->perSafekeeper[i]) - req = msg->perSafekeeper[i]; + /* + * if there is a message specially crafted for this + * safekeeper, send it + */ + if (msg->perSafekeeper[i]) + req = msg->perSafekeeper[i]; - elog(LOG, - "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - msg->size - sizeof(AppendRequestHeader), - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); - - /* We write with msg->size here because the body of the message - * is stored after the end of the WalMessage struct, in the - * allocation for each msg */ - if (!AsyncWrite(i, req, - sizeof(AppendRequestHeader) + req->endLsn - + elog(LOG, + "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + msg->size - sizeof(AppendRequestHeader), + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + + /* + * We write with msg->size here because the body of the + * message is 
stored after the end of the WalMessage + * struct, in the allocation for each msg + */ + if (!AsyncWrite(i, req, + sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) - return; + SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + return; - break; - } + break; + } - /* Flush the WAL message we're sending from SS_SEND_WAL */ + /* Flush the WAL message we're sending from SS_SEND_WAL */ case SS_SEND_WAL_FLUSH: - /* AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll wait - * until the next poll comes along. */ + + /* + * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll + * wait until the next poll comes along. + */ if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0, SS_RECV_FEEDBACK)) return; break; - /* Start to receive the feedback from a message sent via SS_SEND_WAL */ + /* + * Start to receive the feedback from a message sent via + * SS_SEND_WAL + */ case SS_RECV_FEEDBACK: - { - WalMessage* next; - XLogRecPtr minQuorumLsn; - - /* If our reading doesn't immediately succeed, any necessary error handling or state - * setting is taken care of. We can leave any other work until later. */ - if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) - return; + { + WalMessage *next; + XLogRecPtr minQuorumLsn; - next = wk->currMsg->next; - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms receiving of this message */ + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. 
+ */ + if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) + return; - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); /* Updates state & event set */ + next = wk->currMsg->next; + wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms + * receiving of this + * message */ - /* - * Also send the new commit lsn to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other outbound messages - * pending. - */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + wk->currMsg = NULL; + HandleWalKeeperResponse(); + SendMessageToNode(i, next); /* Updates state & event set */ - if (minQuorumLsn > lastSentCommitLsn) - { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); /* - * commitLsn is always the record boundary; remember it so - * we can advance truncateLsn there. But do so only if - * previous value is applied, otherwise it might never catch - * up. + * Also send the new commit lsn to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other + * outbound messages pending. */ - if (candidateTruncateLsn == InvalidXLogRecPtr) + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + + if (minQuorumLsn > lastSentCommitLsn) { - candidateTruncateLsn = minQuorumLsn; + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + + /* + * commitLsn is always the record boundary; remember + * it so we can advance truncateLsn there. But do so + * only if previous value is applied, otherwise it + * might never catch up. + */ + if (candidateTruncateLsn == InvalidXLogRecPtr) + { + candidateTruncateLsn = minQuorumLsn; + } + lastSentCommitLsn = minQuorumLsn; } - lastSentCommitLsn = minQuorumLsn; + break; } - break; - } } - /* We've already done something for these events - don't attempt more - * states than we need to. */ + /* + * We've already done something for these events - don't attempt more + * states than we need to. 
+ */ events = WL_NO_EVENTS; } } @@ -1423,44 +1576,47 @@ AdvancePollState(int i, uint32 events) * failed, a warning is emitted and the connection is reset. */ static bool -AsyncRead(int i, void* value, size_t value_size) +AsyncRead(int i, void *value, size_t value_size) { - WalKeeper* wk = &walkeeper[i]; - char *buf = NULL; - int buf_size = -1; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + char *buf = NULL; + int buf_size = -1; + uint32 events; switch (walprop_async_read(wk->conn, &buf, &buf_size)) { - /* On success, there's just a couple more things we'll check below */ + /* On success, there's just a couple more things we'll check below */ case PG_ASYNC_READ_SUCCESS: break; - /* If we need more input, wait until the socket is read-ready and try - * again. */ + /* + * If we need more input, wait until the socket is read-ready and + * try again. + */ case PG_ASYNC_READ_TRY_AGAIN: UpdateEventSet(wk, WL_SOCKET_READABLE); return false; case PG_ASYNC_READ_FAIL: elog(WARNING, "Failed to read from node %s:%s in %s state: %s", - wk->host, wk->port, - FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); + wk->host, wk->port, + FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); ResetConnection(i); return false; } /* - * If we get here, the read was ok, but we still need to check it was the right amount + * If we get here, the read was ok, but we still need to check it was the + * right amount */ if ((size_t) buf_size != value_size) { elog(FATAL, - "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", - wk->host, wk->port, - FormatWalKeeperState(wk->state), - value_size, buf_size); + "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", + wk->host, wk->port, + FormatWalKeeperState(wk->state), + value_size, buf_size); } /* Copy the resulting info into place */ @@ -1481,10 +1637,10 @@ AsyncRead(int i, void* value, size_t value_size) * single packet. 
*/ static bool -BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) +BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) { - WalKeeper* wk = &walkeeper[i]; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + uint32 events; if (!walprop_blocking_write(wk->conn, msg, msg_size)) { @@ -1497,8 +1653,10 @@ BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) wk->state = success_state; - /* If the new state will be waiting for events to happen, update the event - * set to wait for those */ + /* + * If the new state will be waiting for events to happen, update the event + * set to wait for those + */ events = WalKeeperStateDesiredEvents(success_state); if (events) UpdateEventSet(wk, events); @@ -1515,10 +1673,10 @@ BlockingWrite(int i, void* msg, size_t msg_size, WalKeeperState success_state) * emitted and the connection is reset. */ static bool -AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) +AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) { - WalKeeper* wk = &walkeeper[i]; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + uint32 events; switch (walprop_async_write(wk->conn, msg, msg_size)) { @@ -1526,9 +1684,12 @@ AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKee wk->state = success_state; break; case PG_ASYNC_WRITE_TRY_FLUSH: - /* We still need to call PQflush some more to finish the job; go to - * the appropriate state. Update the event set at the bottom of this - * function */ + + /* + * We still need to call PQflush some more to finish the job; go + * to the appropriate state. 
Update the event set at the bottom of + * this function + */ wk->state = flush_state; break; case PG_ASYNC_WRITE_FAIL: @@ -1559,10 +1720,11 @@ AsyncWrite(int i, void* msg, size_t msg_size, WalKeeperState flush_state, WalKee static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) { - WalKeeper* wk = &walkeeper[i]; - uint32 events; + WalKeeper *wk = &walkeeper[i]; + uint32 events; - /* PQflush returns: + /*--- + * PQflush returns: * 0 if successful [we're good to move on] * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index d71d1adbecd..5bc3539d783 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2159,6 +2159,8 @@ RelMapFile RelMapping RelOptInfo RelOptKind +RelSizeEntry +RelTag RelToCheck RelToCluster RelabelType @@ -2846,6 +2848,8 @@ WaitEventTimeout WaitPMResult WalCloseMethod WalLevel +WalKeeper +WalMessage WalRcvData WalRcvExecResult WalRcvExecStatus @@ -2949,6 +2953,17 @@ XmlTableBuilderData YYLTYPE YYSTYPE YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse _SPI_connection _SPI_plan __AssignProcessToJobObject From 316ff47f8eeb69f89c08061b39711ee4026c4289 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 17 Sep 2021 19:00:29 +0300 Subject: [PATCH 059/167] Silence compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit contrib/zenith/libpagestore.c: In function ‘zenith_connect’: contrib/zenith/libpagestore.c:125:2: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement] 125 | const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); | ^~~~~ --- contrib/zenith/libpagestore.c | 30 
++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 9fe6e2aea62..1dc708f0ad7 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -119,22 +119,24 @@ zenith_connect() * copy values from PQconninfoOption to key/value arrays because * PQconnectdbParams accepts options this way */ - const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); - const char **values = malloc((noptions + 1) * sizeof(*values)); - int i = 0; - - for (i = 0; i < noptions; i++) { - keywords[i] = conn_options[i].keyword; - values[i] = conn_options[i].val; - } - /* add array terminator */ - keywords[i] = NULL; - values[i] = NULL; + const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); + const char **values = malloc((noptions + 1) * sizeof(*values)); + int i = 0; + + for (i = 0; i < noptions; i++) + { + keywords[i] = conn_options[i].keyword; + values[i] = conn_options[i].val; + } + /* add array terminator */ + keywords[i] = NULL; + values[i] = NULL; - pageserver_conn = PQconnectdbParams(keywords, values, false); - free(keywords); - free(values); + pageserver_conn = PQconnectdbParams(keywords, values, false); + free(keywords); + free(values); + } PQconninfoFree(conn_options); From 2ea36fdbd785770f03bf40b8530474a97212f142 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 20 Sep 2021 15:00:32 +0300 Subject: [PATCH 060/167] Fix a badly worded comment --- contrib/zenith/relsize_cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c index eb5b3f45a34..0ba99a128f9 100644 --- a/contrib/zenith/relsize_cache.c +++ b/contrib/zenith/relsize_cache.c @@ -43,8 +43,8 @@ static int relsize_hash_size; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; /* - * Size of cache entry is 20 bytes. 
So 64 entry will take about 1.2 Mb, - * which seems to be a reasonable default. + * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, + * which seems reasonable. */ #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) From fd61df8389db4e9ae8eed5c66d43c036c0a2931d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 16 Sep 2021 15:20:03 +0300 Subject: [PATCH 061/167] Simplify a2e929e by storing starting point in walkeeper itself. --- src/backend/replication/walproposer.c | 63 ++++++++++++++------------- src/include/replication/walproposer.h | 15 +++---- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index b2448102aa7..35624a77352 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -394,11 +394,6 @@ HandleWalKeeperResponse(void) truncateLsn = candidateTruncateLsn; candidateTruncateLsn = InvalidXLogRecPtr; } - for (int i = 0; i < n_walkeepers; i++) - { - if (msg->perSafekeeper[i]) - free(msg->perSafekeeper[i]); - } memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } @@ -489,6 +484,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) */ walkeeper[n_walkeepers].conninfo[0] = '\0'; walkeeper[n_walkeepers].currMsg = NULL; + walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; n_walkeepers += 1; } if (n_walkeepers < 1) @@ -723,7 +719,6 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; msg->ackMask = 0; - memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -766,7 +761,6 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->size = sizeof(AppendRequestHeader); msg->next = NULL; msg->ackMask = 0; - memset(&msg->perSafekeeper, '\0', sizeof(msg->perSafekeeper)); msg->req.tag = 'a'; 
msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -992,25 +986,11 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } else { - uint32 len; - uint32 size; - /* * By convention we always stream since the beginning of - * the record, and flushLsn points to it -- form the - * message starting there. + * the record, and flushLsn points to it. */ - len = msg->req.endLsn - walkeeper[i].voteResponse.flushLsn; - size = sizeof(AppendRequestHeader) + len; - msg->perSafekeeper[i] = malloc(size); - *msg->perSafekeeper[i] = msg->req; - msg->perSafekeeper[i]->beginLsn = - walkeeper[i].voteResponse.flushLsn; - memcpy(&msg->perSafekeeper[i] + 1, - (char *) (&msg->req + 1) + - walkeeper[i].voteResponse.flushLsn - - msg->req.beginLsn, - len); + walkeeper[i].startStreamingAt = walkeeper[i].voteResponse.flushLsn; SendMessageToNode(i, msg); break; } @@ -1307,7 +1287,7 @@ AdvancePollState(int i, uint32 events) voteRequest = (VoteRequest) { .tag = 'v', - .term = propTerm + .term = propTerm }; memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); } @@ -1465,16 +1445,33 @@ AdvancePollState(int i, uint32 events) AppendRequestHeader *req = &msg->req; /* - * if there is a message specially crafted for this - * safekeeper, send it + * If we need to send this message not from the beginning, + * form the cut version. Only happens for the first + * message. 
*/ - if (msg->perSafekeeper[i]) - req = msg->perSafekeeper[i]; + if (wk->startStreamingAt > msg->req.beginLsn) + { + uint32 len; + uint32 size; + + Assert(wk->startStreamingAt < req->endLsn); + + len = msg->req.endLsn - wk->startStreamingAt; + size = sizeof(AppendRequestHeader) + len; + req = malloc(size); + *req = msg->req; + req->beginLsn = wk->startStreamingAt; + memcpy(req + 1, + (char *) (&msg->req + 1) + wk->startStreamingAt - + msg->req.beginLsn, + len); + } elog(LOG, - "sending message with len %ld beginLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - msg->size - sizeof(AppendRequestHeader), + "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); @@ -1487,7 +1484,13 @@ AdvancePollState(int i, uint32 events) sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + { + if (req != &msg->req) + free(req); return; + } + if (req != &msg->req) + free(req); break; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index c455d0564e9..222faaea41d 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -258,13 +258,7 @@ struct WalMessage { WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ - uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ - /* - * By convention safekeeper starts receiving data since record boundary, we - * may need to send first message not from the chunk beginning for that; - * such trimmed message is formed here. 
- */ - AppendRequestHeader *perSafekeeper[MAX_WALKEEPERS]; + uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ AppendRequestHeader req; /* request to walkeeper (message header) */ /* PHANTOM FIELD: @@ -327,7 +321,12 @@ typedef struct WalKeeper WalKeeperState state; /* walkeeper state machine state */ AcceptorGreeting greet; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ - AppendResponse feedback; /* feedback to master */ + AppendResponse feedback; /* feedback to master */ + /* + * streaming must be started at the record boundary which is saved here, if + * it differs from the chunk start + */ + XLogRecPtr startStreamingAt; } WalKeeper; From 484ac781f9a65963be02c431a66dbd78ec74fd62 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Wed, 22 Sep 2021 10:54:47 -0700 Subject: [PATCH 062/167] Catch walkeeper ErrorResponse in PQgetCopyData PQgetCopyData can sometimes indicate that the copy is done if the backend returns an error response. So while we still expect that the walkeeper never sends CopyDone, we can't expect it to never produce errors. --- .../libpqwalproposer/libpqwalproposer.c | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 1b8a53b5066..f538ed9133f 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -273,16 +273,32 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) * (> 0) if it was successful; that value is the amount transferred. * * The protocol we use between walproposer and walkeeper means that we - * (i.e. walproposer) won't ever receive a message saying that the copy - * is done. 
*/ + * *usually* wouldn't expect to see that the copy is done, but this can + * sometimes be triggered by the server returning an ErrorResponse (which + * also happens to have the effect that the copy is done). + */ switch (result = PQgetCopyData(conn->pg_conn, buf, true)) { case 0: return PG_ASYNC_READ_TRY_AGAIN; case -1: - /* As mentioned above; this shouldn't happen */ - elog(FATAL, "unexpected return -1 from PQgetCopyData"); - break; + { + /* + * If we get -1, it's probably because of a server error; the + * walkeeper won't normally send a CopyDone message. + * + * We can check PQgetResult to make sure that the server failed; + * it'll always result in PGRES_FATAL_ERROR + */ + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); + + if (status != PGRES_FATAL_ERROR) + elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + + /* If there was actually an error, it'll be properly reported by + * calls to PQerrorMessage -- we don't have to do anything else */ + return PG_ASYNC_READ_FAIL; + } case -2: return PG_ASYNC_READ_FAIL; default: From 5edfd254b0f746c84428e846430e9e2111ea84a6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 24 Sep 2021 19:48:36 +0300 Subject: [PATCH 063/167] Use buffered I/O for reading commands from stdin. Whatever the bug mentioned in the FIXME comment was with buffered I/O, it has been fixed now. This greatly reduces the amount of CPU time spent in WAL redo. 
--- src/backend/tcop/zenith_wal_redo.c | 74 ++++++++---------------------- 1 file changed, 18 insertions(+), 56 deletions(-) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 15db900cc8a..be8cc59a94b 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -109,6 +109,7 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(exit_group), PG_SCMP_ALLOW(pselect6), PG_SCMP_ALLOW(read), + PG_SCMP_ALLOW(fstat), /* needed by fread() */ PG_SCMP_ALLOW(select), PG_SCMP_ALLOW(write), @@ -433,84 +434,45 @@ pprint_tag(BufferTag *tag) * EOF is returned if end-of-file input is seen; time to shut down. * ---------------- */ - -/* - * Wait until there is data in stdin. Prints a log message every 10 s whil - * waiting. - */ -static void -wait_with_timeout(void) -{ - for (;;) - { - struct timeval timeout = {10, 0}; - fd_set fds; - int ret; - - FD_ZERO(&fds); - FD_SET(STDIN_FILENO, &fds); - - ret = select(1, &fds, NULL, NULL, &timeout); - if (ret != 0) - break; - elog(DEBUG1, "still alive"); - } -} - static int ReadRedoCommand(StringInfo inBuf) { - char c; + char hdr[1 + sizeof(int32)]; int qtype; int32 len; - int nread; - - /* FIXME: Use unbuffered I/O here, because the WAL redo process was getting - * stuck with buffered I/O. I'm not sure why, or whether the bug was somewhere - * in here or in the calling page server side. - */ - wait_with_timeout(); - if (read(STDIN_FILENO, &c, 1) == 0) - return EOF; - qtype = c; - /* - * Like in the FE/BE protocol, all messages have a length word next - * after the type code; we can read the message contents independently of - * the type. 
- */ - if (read(STDIN_FILENO, &len, 4) != 4) + /* Read message type and message length */ + if (fread(hdr, 1, sizeof(hdr), stdin) != sizeof(hdr)) { - ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("could not read message length"))); + if (ferror(stdin) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message header"))); + return EOF; } - + qtype = hdr[0]; + memcpy(&len, &hdr[1], sizeof(int32)); len = pg_ntoh32(len); if (len < 4) - { ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid message length"))); - return EOF; - } len -= 4; /* discount length itself */ + /* Read the message payload */ enlargeStringInfo(inBuf, len); - nread = 0; - while (nread < len) { - int n = read(STDIN_FILENO, inBuf->data + nread, len - nread); - if (n == -1) + if (fread(inBuf->data, 1, len, stdin) != len) + { + if (ferror(stdin) != 0) ereport(ERROR, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("read error: %m"))); - if (n == 0) + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not read message"))); + else ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("unexpected EOF"))); - nread += n; } inBuf->len = len; inBuf->data[len] = '\0'; From 139a5b6952bf67c97e25716ed8fdb7355adb56f8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 30 Sep 2021 10:27:36 +0300 Subject: [PATCH 064/167] Replace fread() with plain read() and a hand-written buffer. The fread() call required allowing the 'fstat' syscall in the seccomp configuration, and apparently on some platforms also 'newfstatat', as Max reported this error: Sep 28 15:56:55.522 ERRO wal-redo-postgres: --------------------------------------- Sep 28 15:56:55.522 ERRO wal-redo-postgres: seccomp: bad syscall 262 Sep 28 15:56:55.522 ERRO wal-redo-postgres: --------------------------------------- I'm afraid of allowing 'newfstatat', that seems like it's opening too much attack surface, since it allows access to files by filename. 
Maybe it's OK, but I'm not sure, but there isn't any fundamental reason why we'd need to call it, I'm not sure why glibc's fread() wants to call it. So let's avoid the trouble by writing our own simple buffer over plain read(). --- src/backend/tcop/zenith_wal_redo.c | 84 +++++++++++++++++++++++++++--- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index be8cc59a94b..a02592fc0c2 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -94,6 +94,7 @@ static void PushPage(StringInfo input_message); static void ApplyRecord(StringInfo input_message); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); +static ssize_t buffered_read(void *buf, size_t count); static BufferTag target_redo_tag; @@ -109,7 +110,6 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(exit_group), PG_SCMP_ALLOW(pselect6), PG_SCMP_ALLOW(read), - PG_SCMP_ALLOW(fstat), /* needed by fread() */ PG_SCMP_ALLOW(select), PG_SCMP_ALLOW(write), @@ -352,6 +352,8 @@ WalRedoMain(int argc, char *argv[], * EOF means we're done. Perform normal shutdown. 
*/ case EOF: + ereport(LOG, + (errmsg("received EOF on stdin, shutting down"))); #ifdef HAVE_LIBSECCOMP /* @@ -437,19 +439,27 @@ pprint_tag(BufferTag *tag) static int ReadRedoCommand(StringInfo inBuf) { + ssize_t ret; char hdr[1 + sizeof(int32)]; int qtype; int32 len; /* Read message type and message length */ - if (fread(hdr, 1, sizeof(hdr), stdin) != sizeof(hdr)) + ret = buffered_read(hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) { - if (ferror(stdin) != 0) + if (ret == 0) + return EOF; + else if (ret < 0) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message header"))); - return EOF; + errmsg("could not read message header: %m"))); + else + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected EOF"))); } + qtype = hdr[0]; memcpy(&len, &hdr[1], sizeof(int32)); len = pg_ntoh32(len); @@ -463,12 +473,13 @@ ReadRedoCommand(StringInfo inBuf) /* Read the message payload */ enlargeStringInfo(inBuf, len); - if (fread(inBuf->data, 1, len, stdin) != len) + ret = buffered_read(inBuf->data, len); + if (ret != len) { - if (ferror(stdin) != 0) + if (ret < 0) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("could not read message"))); + errmsg("could not read message: %m"))); else ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -679,3 +690,60 @@ GetPage(StringInfo input_message) elog(TRACE, "Page sent back for block %u", blknum); } + + +/* Buffer used by buffered_read() */ +static char stdin_buf[16 * 1024]; +static size_t stdin_len = 0; /* # of bytes in buffer */ +static size_t stdin_ptr = 0; /* # of bytes already consumed */ + +/* + * Like read() on stdin, but buffered. + * + * We cannot use libc's buffered fread(), because it uses syscalls that we + * have disabled with seccomp(). Depending on the platform, it can call + * 'fstat' or 'newfstatat'. 'fstat' is probably harmless, but 'newfstatat' + * seems problematic because it allows interrogating files by path name. 
+ * + * The return value is the number of bytes read. On error, -1 is returned, and + * errno is set appropriately. Unlike read(), this fills the buffer completely + * unless an error happens or EOF is reached. + */ +static ssize_t +buffered_read(void *buf, size_t count) +{ + char *dst = buf; + + while (count > 0) + { + size_t nthis; + + if (stdin_ptr == stdin_len) + { + ssize_t ret; + + ret = read(STDIN_FILENO, stdin_buf, sizeof(stdin_buf)); + if (ret < 0) + { + /* don't do anything here that could set 'errno' */ + return ret; + } + if (ret == 0) + { + /* EOF */ + break; + } + stdin_len = (size_t) ret; + stdin_ptr = 0; + } + nthis = Min(stdin_len - stdin_ptr, count); + + memcpy(dst, &stdin_buf[stdin_ptr], nthis); + + stdin_ptr += nthis; + count -= nthis; + dst += nthis; + } + + return (dst - (char *) buf); +} From c4bd89efc3fb378616062269dd1246d7b245e21a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 6 Oct 2021 10:57:00 +0300 Subject: [PATCH 065/167] Store unlogged tables locally, and replace PD_WAL_LOGGED. The smgr implementation needs to distinguish between unlogged/temp and regular 'permanent' relations, but the smgr API doesn't currently include that information. Add a 'relpersistence' field to SmgrRelationData, and as an argument to smgropen(). However, not all callers of smgropen() have a relcache entry at hand, so we allow some operations to pass 0, meaning 'unknown'. Now that we can store unlogged tables locally, use the same machinery to handle the buffered GiST and SP-GiST index builds. They populate the index by inserting all the tuples, and use the shared buffer cache while they do that. They don't WAL-log the pages while they do that, they log the whole relation as a separate bulk operation after the build has finished. That poses a problem for Zenith, where smgrwrite() is a no-op and we rely on WAL-logging to reconstruct the pages. 
Solve that problem by storing the pages locally in the compute node, like an unlogged relation, until the index build finishes and all the pages have been WAL-logged. To do that, the smgr needs to know when the caller is an unlogged build operation like that, so add functions to the Smgr API for that. With this commit, we no longer generate an FPI record whenever a rel is extended with an all-zeros page. See github issue #482. That greatly reduces the amount of WAL generated during bulk loading. --- contrib/zenith/pagestore_client.h | 1 + contrib/zenith/pagestore_smgr.c | 629 +++++++++++++++++++---- contrib/zenith/relsize_cache.c | 15 + src/backend/access/common/bufmask.c | 2 - src/backend/access/gin/gininsert.c | 7 + src/backend/access/gist/gistbuild.c | 15 +- src/backend/access/gist/gistutil.c | 2 - src/backend/access/heap/heapam_handler.c | 2 +- src/backend/access/spgist/spginsert.c | 8 +- src/backend/access/transam/xloginsert.c | 15 +- src/backend/access/transam/xlogutils.c | 2 +- src/backend/catalog/storage.c | 10 +- src/backend/commands/tablecmds.c | 8 +- src/backend/storage/buffer/bufmgr.c | 20 +- src/backend/storage/buffer/localbuf.c | 2 +- src/backend/storage/page/bufpage.c | 1 - src/backend/storage/smgr/md.c | 4 +- src/backend/storage/smgr/smgr.c | 43 +- src/backend/tcop/zenith_wal_redo.c | 3 +- src/backend/utils/adt/dbsize.c | 26 +- src/include/storage/bufmgr.h | 2 + src/include/storage/bufpage.h | 19 +- src/include/storage/smgr.h | 13 +- src/include/utils/rel.h | 3 +- 24 files changed, 670 insertions(+), 182 deletions(-) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 073568f90c3..3643971f254 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -193,5 +193,6 @@ extern void relsize_hash_init(void); extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size); extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); 
extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size); +extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum); #endif diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index ac7e94f74c0..99914365428 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -3,6 +3,37 @@ * pagestore_smgr.c * * + * + * Temporary and unlogged rels + * --------------------------- + * + * Temporary and unlogged tables are stored locally, by md.c. The functions + * here just pass the calls through to corresponding md.c functions. + * + * Index build operations that use the buffer cache are also handled locally, + * just like unlogged tables. Such operations must be marked by calling + * smgr_start_unlogged_build() and friends. + * + * In order to know what relations are permanent and which ones are not, we + * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set + * by smgropen() callers, when they have the relcache entry at hand. However, + * sometimes we need to open an SmgrRelation for a relation without the + * relcache. That is needed when we evict a buffer; we might not have the + * SmgrRelation for that relation open yet. To deal with that, the + * 'relpersistence' can be left to zero, meaning we don't know if it's + * permanent or not. Most operations are not allowed with relpersistence==0, + * but smgrwrite() does work, which is what we need for buffer eviction. and + * smgrunlink() so that a backend doesn't need to have the relcache entry at + * transaction commit, where relations that were dropped in the transaction + * are unlinked. + * + * If smgrwrite() is called and smgr_relpersistence == 0, we check if the + * relation file exists locally or not. If it does exist, we assume it's an + * unlogged relation and write the page there. 
Otherwise it must be a + * permanent relation, WAL-logged and stored on the page server, and we ignore + * the write like we do for permanent relations. + * + * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -14,15 +45,18 @@ */ #include "postgres.h" +#include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" #include "access/xlog_internal.h" +#include "catalog/pg_class.h" #include "pagestore_client.h" #include "storage/relfilenode.h" #include "storage/smgr.h" #include "access/xlogdefs.h" #include "postmaster/interrupt.h" #include "storage/bufmgr.h" +#include "storage/md.h" #include "fmgr.h" #include "miscadmin.h" #include "pgstat.h" @@ -40,7 +74,6 @@ #ifdef DEBUG_COMPARE_LOCAL #include "access/nbtree.h" #include "storage/bufpage.h" -#include "storage/md.h" #include "access/xlog_internal.h" static char *hexdump_page(char *page); @@ -59,6 +92,18 @@ char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; +/* unlogged relation build states */ +typedef enum +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; + +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + StringInfoData zm_pack_request(ZenithRequest *msg) { @@ -328,13 +373,22 @@ log_newpage_copy(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno, { PGAlignedBlock copied_buffer; - /* set the flag in the original page, like log_newpage() does. */ - ((PageHeader) page)->pd_flags |= PD_WAL_LOGGED; - memcpy(copied_buffer.data, page, BLCKSZ); return log_newpage(rnode, forkNum, blkno, copied_buffer.data, page_std); } +/* + * Is 'buffer' identical to a freshly initialized empty heap page? 
+ */ +static bool +PageIsEmptyHeapPage(char *buffer) +{ + PGAlignedBlock empty_page; + + PageInit((Page) empty_page.data, BLCKSZ, 0); + + return memcmp(buffer, empty_page.data, BLCKSZ) == 0; +} static void zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) @@ -345,18 +399,11 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, return; /* - * If the page was not WAL-logged before eviction then we can lose its - * modification. PD_WAL_LOGGED bit is used to mark pages which are - * wal-logged. - * - * See also comments to PD_WAL_LOGGED. - * - * FIXME: GIN/GiST/SP-GiST index build will scan and WAL-log again the - * whole index. That's duplicative with the WAL-logging that we do here. - * See log_newpage_range() calls. - * - * FIXME: Redoing this record will set the LSN on the page. That could - * mess up the LSN-NSN interlock in GiST index build. + * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM + * changes are not WAL-logged when the changes are made, so this is our + * last chance to log them, otherwise they're lost. That's OK for + * correctness, the non-logged updates are not critical. But we want to + * have a reasonably up-to-date VM and FSM in the page server. */ if (forknum == FSM_FORKNUM && !RecoveryInProgress()) { @@ -366,12 +413,13 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false); XLogFlush(recptr); lsn = recptr; - elog(SmgrTrace, "FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + ereport(SmgrTrace, + (errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. 
Evicted at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); } else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress()) { @@ -388,77 +436,83 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, XLogFlush(recptr); lsn = recptr; - elog(SmgrTrace, "Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + ereport(SmgrTrace, + (errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); } - else if (!(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED) - && !RecoveryInProgress()) + else if (lsn == InvalidXLogRecPtr) { - XLogRecPtr recptr; - /* - * We assume standard page layout here. + * When PostgreSQL extends a relation, it calls smgrextend() with an all-zeros pages, + * and we can just ignore that in Zenith. We do need to remember the new size, + * though, so that smgrnblocks() returns the right answer after the rel has + * been extended. We rely on the relsize cache for that. * - * But at smgr level we don't really know what kind of a page this is. - * We have filtered visibility map pages and fsm pages above. TODO Do - * we have any special page types? - */ - - recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, true); - - /* - * If we wal-log hint bits, someone could concurrently update page and - * reset PD_WAL_LOGGED again, so this assert is not relevant anymore. + * A completely empty heap page doesn't need to be WAL-logged, either. The + * heapam can leave such a page behind, if e.g. 
an insert errors out after + * initializing the page, but before it has inserted the tuple and WAL-logged + * the change. When we read the page from the page server, it will come back + * as all-zeros. That's OK, the heapam will initialize an all-zeros page on + * first use. * - * See comment to FlushBuffer(). The caller must hold a pin on the - * buffer and have share-locked the buffer contents. (Note: a - * share-lock does not prevent updates of hint bits in the buffer, so - * the page could change while the write is in progress, but we assume - * that that will not invalidate the data written.) + * In other scenarios, evicting a dirty page with no LSN is a bad sign: it implies + * that the page was not WAL-logged, and its contents will be lost when it's + * evicted. */ - Assert(((PageHeader) buffer)->pd_flags & PD_WAL_LOGGED); /* Should be set by - * log_newpage */ - - /* - * Need to flush it too, so that it gets sent to the Page Server - * before we might need to read it back. It should get flushed - * eventually anyway, at least if there is some other WAL activity, so - * this isn't strictly necessary for correctness. But if there is no - * other WAL activity, the page read might get stuck waiting for the - * record to be streamed out for an indefinite time. - * - * FIXME: Flushing the WAL is expensive. We should track the last - * "evicted" LSN instead, and update it here. Or just kick the - * bgwriter to do the flush, there is no need for us to block here - * waiting for it to finish. 
- */ - XLogFlush(recptr); - lsn = recptr; - elog(SmgrTrace, "Force wal logging of page %u of relation %u/%u/%u.%u, lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + if (PageIsNew(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else if (PageIsEmptyHeapPage(buffer)) + { + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } + else + { + ereport(PANIC, + (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum))); + } } else { - elog(SmgrTrace, "Page %u of relation %u/%u/%u.%u is alread wal logged at lsn=%X", - blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - forknum, (uint32) lsn); + ereport(SmgrTrace, + (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, LSN_FORMAT_ARGS(lsn)))); } + + /* + * Remember the LSN on this page. When we read the page again, we must + * read the same or newer version of it. + */ SetLastWrittenPageLSN(lsn); } - /* * zenith_init() -- Initialize private state */ @@ -568,6 +622,29 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) bool latest; XLogRecPtr request_lsn; + switch (reln->smgr_relpersistence) + { + case 0: + /* + * We don't know if it's an unlogged rel stored locally, or permanent + * rel stored in the page server. First check if it exists locally. 
+ * If it does, great. Otherwise check if it exists in the page server. + */ + if (mdexists(reln, forkNum)) + return true; + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdexists(reln, forkNum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -615,6 +692,23 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) void zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdcreate(reln, forkNum, isRedo); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + elog(SmgrTrace, "Create relation %u/%u/%u.%u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -648,9 +742,13 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { -#ifdef DEBUG_COMPARE_LOCAL + /* + * Might or might not exist locally, depending on whether it's + * an unlogged or permanent relation (or if DEBUG_COMPARE_LOCAL is + * set). Try to unlink, it won't do any harm if the file doesn't + * exist. 
+ */ mdunlink(rnode, forkNum, isRedo); -#endif } /* @@ -668,7 +766,25 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { XLogRecPtr lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + zenith_wallog_page(reln, forkNum, blkno, buffer); + set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); @@ -691,13 +807,16 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void zenith_open(SMgrRelation reln) { + /* + * We don't have anything special to do here. Call mdopen() to let md.c + * initialize itself. That's only needed for temporary or unlogged + * relations, but it's dirt cheap so do it always to make sure the md + * fields are initialized, for debugging purposes if nothing else. + */ + mdopen(reln); + /* no work */ elog(SmgrTrace, "[ZENITH_SMGR] open noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdopen(reln); -#endif } /* @@ -706,13 +825,11 @@ zenith_open(SMgrRelation reln) void zenith_close(SMgrRelation reln, ForkNumber forknum) { - /* no work */ - elog(SmgrTrace, "[ZENITH_SMGR] close noop"); - -#ifdef DEBUG_COMPARE_LOCAL - if (IS_LOCAL_REL(reln)) - mdclose(reln, forknum); -#endif + /* + * Let md.c close it, if it had it open. Doesn't hurt to do this + * even for permanent relations that have no local storage. 
+ */ + mdclose(reln, forknum); } /* @@ -721,6 +838,23 @@ zenith_close(SMgrRelation reln, ForkNumber forknum) bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { + switch (reln->smgr_relpersistence) + { + case 0: + /* probably shouldn't happen, but ignore it */ + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdprefetch(reln, forknum, blocknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + /* not implemented */ elog(SmgrTrace, "[ZENITH_SMGR] prefetch noop"); return true; @@ -736,6 +870,25 @@ void zenith_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { + switch (reln->smgr_relpersistence) + { + case 0: + /* mdwriteback() does nothing if the file doesn't exist */ + mdwriteback(reln, forknum, blocknum, nblocks); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwriteback(reln, forknum, blocknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + /* not implemented */ elog(SmgrTrace, "[ZENITH_SMGR] writeback noop"); @@ -756,6 +909,23 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, bool latest; XLogRecPtr request_lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithGetPageRequest request = { @@ -796,9 +966,6 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, pfree(resp); - /* Clear PD_WAL_LOGGED bit stored in WAL record */ - 
((PageHeader) buffer)->pd_flags &= ~PD_WAL_LOGGED; - #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -915,6 +1082,38 @@ zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { XLogRecPtr lsn; + switch (reln->smgr_relpersistence) + { + case 0: + /* This is a bit tricky. Check if the relation exists locally */ + if (mdexists(reln, forknum)) + { + /* It exists locally. Guess it's unlogged then. */ + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + + /* + * We could set relpersistence now that we have determined + * that it's local. But we don't dare to do it, because that + * would immediately allow reads as well, which shouldn't + * happen. We could cache it with a different 'relpersistence' + * value, but this isn't performance critical. + */ + return; + } + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdwrite(reln, forknum, blocknum, buffer, skipFsync); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + zenith_wallog_page(reln, forknum, blocknum, buffer); lsn = PageGetLSN(buffer); @@ -942,8 +1141,32 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) bool latest; XLogRecPtr request_lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + return mdnblocks(reln, forknum); + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks)) + { + elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, n_blocks); return n_blocks; + } request_lsn = zenith_get_request_lsn(&latest); { @@ -1002,6 
+1225,24 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { XLogRecPtr lsn; + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdtruncate(reln, forknum, nblocks); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks); /* @@ -1044,6 +1285,24 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) void zenith_immedsync(SMgrRelation reln, ForkNumber forknum) { + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdimmedsync(reln, forknum); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + elog(SmgrTrace, "[ZENITH_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL @@ -1052,6 +1311,178 @@ zenith_immedsync(SMgrRelation reln, ForkNumber forknum) #endif } +/* + * zenith_start_unlogged_build() -- Starting build operation on a rel. + * + * Some indexes are built in two phases, by first populating the table with + * regular inserts, using the shared buffer cache but skipping WAL-logging, + * and WAL-logging the whole relation after it's done. Zenith relies on the + * WAL to reconstruct pages, so we cannot use the page server in the + * first phase when the changes are not logged. + */ +static void +zenith_start_unlogged_build(SMgrRelation reln) +{ + /* + * Currently, there can be only one unlogged relation build operation in + * progress at a time. That's enough for the current usage. 
+ */ + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + elog(ERROR, "unlogged relation build is already in progress"); + Assert(unlogged_build_rel == NULL); + + ereport(SmgrTrace, + (errmsg("starting unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + if (smgrnblocks(reln, MAIN_FORKNUM) != 0) + elog(ERROR, "cannot perform unlogged index build, index is not empty "); + + unlogged_build_rel = reln; + unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; + + /* Make the relation look like it's unlogged */ + reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; + + /* + * FIXME: should we pass isRedo true to create the tablespace dir if it + * doesn't exist? Is it needed? + */ + mdcreate(reln, MAIN_FORKNUM, false); +} + +/* + * zenith_finish_unlogged_build_phase_1() + * + * Call this after you have finished populating a relation in unlogged mode, + * before you start WAL-logging it. 
+ */ +static void +zenith_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) + return; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; +} + +/* + * zenith_end_unlogged_build() -- Finish an unlogged rel build. + * + * Call this after you have finished WAL-logging an relation that was + * first populated without WAL-logging. + * + * This removes the local copy of the rel, since it's now been fully + * WAL-logged and is present in the page server. + */ +static void +zenith_end_unlogged_build(SMgrRelation reln) +{ + Assert(unlogged_build_rel == reln); + + ereport(SmgrTrace, + (errmsg("ending unlogged build of relation %u/%u/%u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode))); + + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) + { + RelFileNodeBackend rnode; + + Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); + Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); + + /* Make the relation look permanent again */ + reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; + + /* Remove local copy */ + rnode = reln->smgr_rnode; + for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", + rnode.node.spcNode, + rnode.node.dbNode, + rnode.node.relNode, + forknum); + + forget_cached_relsize(rnode.node, forknum); + mdclose(reln, forknum); + /* use isRedo == true, so that we drop it immediately */ + mdunlink(rnode, forknum, true); + } + } + + unlogged_build_rel = NULL; + unlogged_build_phase = 
UNLOGGED_BUILD_NOT_IN_PROGRESS; +} + +static void +AtEOXact_zenith(XactEvent event, void *arg) +{ + switch (event) + { + case XACT_EVENT_ABORT: + case XACT_EVENT_PARALLEL_ABORT: + + /* + * Forget about any build we might have had in progress. The local + * file will be unlinked by smgrDoPendingDeletes() + */ + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + break; + + case XACT_EVENT_COMMIT: + case XACT_EVENT_PARALLEL_COMMIT: + case XACT_EVENT_PREPARE: + case XACT_EVENT_PRE_COMMIT: + case XACT_EVENT_PARALLEL_PRE_COMMIT: + case XACT_EVENT_PRE_PREPARE: + if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + (errmsg("unlogged index build was not properly finished")))); + } + break; + } +} + static const struct f_smgr zenith_smgr = { .smgr_init = zenith_init, @@ -1069,6 +1500,10 @@ static const struct f_smgr zenith_smgr = .smgr_nblocks = zenith_nblocks, .smgr_truncate = zenith_truncate, .smgr_immedsync = zenith_immedsync, + + .smgr_start_unlogged_build = zenith_start_unlogged_build, + .smgr_finish_unlogged_build_phase_1 = zenith_finish_unlogged_build_phase_1, + .smgr_end_unlogged_build = zenith_end_unlogged_build, }; @@ -1086,6 +1521,8 @@ smgr_zenith(BackendId backend, RelFileNode rnode) void smgr_init_zenith(void) { + RegisterXactCallback(AtEOXact_zenith, NULL); + smgr_init_standard(); zenith_init(); } diff --git a/contrib/zenith/relsize_cache.c b/contrib/zenith/relsize_cache.c index 0ba99a128f9..993903b1b18 100644 --- a/contrib/zenith/relsize_cache.c +++ b/contrib/zenith/relsize_cache.c @@ -127,6 +127,21 @@ update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size) } } +void +forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) +{ + if (relsize_hash_size > 0) + { + RelTag tag; + + tag.rnode = rnode; + tag.forknum = forknum; + LWLockAcquire(relsize_lock, 
LW_EXCLUSIVE); + hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); + LWLockRelease(relsize_lock); + } +} + void relsize_hash_init(void) { diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index e042cfdef92..003a0befb25 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -54,8 +54,6 @@ mask_page_hint_bits(Page page) PageClearFull(page); PageClearHasFreeLinePointers(page); - phdr->pd_flags &= ~PD_WAL_LOGGED; - /* * During replay, if the page LSN has advanced past our XLOG record's LSN, * we don't mark the page all-visible. See heap_xlog_visible() for diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 0e8672c9e90..dfad28d1f61 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -335,6 +335,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + initGinState(&buildstate.ginstate, index); buildstate.indtuples = 0; memset(&buildstate.buildStats, 0, sizeof(GinStatsData)); @@ -408,6 +410,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index); ginUpdateStats(index, &buildstate.buildStats, true); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. 
@@ -418,6 +422,9 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); /* * Return statistics diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index f46a42197c9..aef96c91da0 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -40,6 +40,7 @@ #include "access/tableam.h" #include "access/xloginsert.h" #include "catalog/index.h" +#include "catalog/storage.h" #include "miscadmin.h" #include "optimizer/optimizer.h" #include "storage/bufmgr.h" @@ -289,6 +290,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; + smgr_start_unlogged_build(index->rd_smgr); + /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); @@ -321,6 +324,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if * WAL-logging is required, write all pages to the WAL now. 
@@ -331,6 +336,9 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); } /* okay, all heap tuples are indexed */ @@ -456,8 +464,13 @@ gist_indexsortbuild(GISTBuildState *state) smgrwrite(state->indexrel->rd_smgr, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, + { + XLogRecPtr lsn; + + lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); + SetLastWrittenPageLSN(lsn); + } pfree(pagestate->page); pfree(pagestate); diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 1a1bb4a53f6..43ba03b6eb9 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -866,8 +866,6 @@ gistNewBuffer(Relation r) if (XLogStandbyInfoActive() && RelationNeedsWAL(r)) gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page)); - ((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; - return buffer; } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index d1192e6a0c5..5795497051f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -628,7 +628,7 @@ heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) { SMgrRelation dstrel; - dstrel = smgropen(*newrnode, rel->rd_backend); + dstrel = smgropen(*newrnode, rel->rd_backend, rel->rd_rel->relpersistence); RelationOpenSmgr(rel); /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 1af0af7da21..d85dd54e4df 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -85,6 +85,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) elog(ERROR, "index \"%s\" already contains data", 
RelationGetRelationName(index)); + smgr_start_unlogged_build(index->rd_smgr); + /* * Initialize the meta page and root pages */ @@ -105,7 +107,6 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS); MarkBufferDirty(nullbuffer); - END_CRIT_SECTION(); UnlockReleaseBuffer(metabuffer); @@ -131,6 +132,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) SpGistUpdateMetaPage(index); + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if WAL-logging is * required, write all pages to the WAL now. @@ -141,6 +144,9 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } + SetLastWrittenPageLSN(XactLastRecEnd); + + smgr_end_unlogged_build(index->rd_smgr); result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 4622325901b..b153fad594d 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -239,7 +239,6 @@ XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags) regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; - ((PageHeader)regbuf->page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -295,7 +294,6 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum, regbuf->flags = flags; regbuf->rdata_tail = (XLogRecData *) ®buf->rdata_head; regbuf->rdata_len = 0; - ((PageHeader)page)->pd_flags |= PD_WAL_LOGGED; /* * Check that this page hasn't already been registered with some other @@ -1183,18 +1181,7 @@ log_newpage_range(Relation rel, ForkNumber forkNum, MarkBufferDirty(bufpack[i]); } - /* - * Zenith forces WAL logging of evicted pages, - * so it can happen that in 
some cases when pages are first - * modified and then WAL logged (for example building GiST/GiN - * indexes) there are no more pages which need to be WAL logged at - * the end of build procedure. As far as XLogInsert throws error - * if not records were inserted, we need to reset the insert state. - */ - if (nbufs > 0) - recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); - else - XLogResetInsertion(); + recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI); for (i = 0; i < nbufs; i++) { diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index baf4dbed4aa..c5d03cd4b83 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -463,7 +463,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, Assert(blkno != P_NEW); /* Open the relation at smgr level */ - smgr = smgropen(rnode, InvalidBackendId); + smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Create the target file if it doesn't already exist. 
This lets us cope diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index cba7a9ada07..8ab9c44d05b 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -143,7 +143,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence) return NULL; /* placate compiler */ } - srel = smgropen(rnode, backend); + srel = smgropen(rnode, backend, relpersistence); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -624,7 +624,7 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->relnode, pending->backend); + srel = smgropen(pending->relnode, pending->backend, 0); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) @@ -705,7 +705,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) BlockNumber total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rnode, InvalidBackendId); + srel = smgropen(pendingsync->rnode, InvalidBackendId, 0); /* * We emit newpage WAL records for smaller relations. 
@@ -914,7 +914,7 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -927,7 +927,7 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 5bb98b6d116..a91e0d828f6 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -693,12 +693,6 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("cannot create temporary table within security-restricted operation"))); - if (stmt->relation->relpersistence == RELPERSISTENCE_UNLOGGED) - { - /* Unlogged tables are not supported by Zenith */ - stmt->relation->relpersistence = RELPERSISTENCE_PERMANENT; - } - /* * Determine the lockmode to use when scanning parents. A self-exclusive * lock is needed here. 
@@ -14102,7 +14096,7 @@ index_copy_data(Relation rel, RelFileNode newrnode) { SMgrRelation dstrel; - dstrel = smgropen(newrnode, rel->rd_backend); + dstrel = smgropen(newrnode, rel->rd_backend, rel->rd_rel->relpersistence); RelationOpenSmgr(rel); /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 1daff7125b4..2ec50f6d66e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -792,7 +792,7 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + SMgrRelation smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); Assert(InRecovery); @@ -1621,11 +1621,6 @@ MarkBufferDirty(Buffer buffer) if (VacuumCostActive) VacuumCostBalance += VacuumCostPageDirty; } - /* - * Clear PD_WAL_LOGGED flag so that if dirty page is evicted from page pool - * before been WAL logged, FPI WAL record will be enforced. - */ - ((PageHeader)BufferGetPage(buffer))->pd_flags &= ~PD_WAL_LOGGED; } /* @@ -2041,15 +2036,6 @@ BufferSync(int flags) item->blockNum = bufHdr->tag.blockNum; } - /* Zenith XXX - * Consider marking this page as not WAL-logged, - * so that pagestore_smgr issued a log record before eviction - * and persisted hint changes. - * TODO: check performance impacts of this approach - * since extra wal-logging may worsen the performance. - */ - //((PageHeader)page)->pd_flags &= ~PD_WAL_LOGGED; - UnlockBufHdr(bufHdr, buf_state); /* Check for barrier events in case NBuffers is large. 
*/ @@ -2896,7 +2882,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* Find smgr relation for buffer */ if (reln == NULL) - reln = smgropen(buf->tag.rnode, InvalidBackendId); + reln = smgropen(buf->tag.rnode, InvalidBackendId, 0); TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, @@ -4901,7 +4887,7 @@ IssuePendingWritebacks(WritebackContext *context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(tag.rnode, InvalidBackendId); + reln = smgropen(tag.rnode, InvalidBackendId, 0); smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 04b3558ea33..b9811cc7327 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -215,7 +215,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(bufHdr->tag.rnode, MyBackendId); + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 48dc7bde265..3616846ad07 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -427,7 +427,6 @@ PageRestoreTempPage(Page tempPage, Page oldPage) pageSize = PageGetPageSize(tempPage); memcpy((char *) oldPage, (char *) tempPage, pageSize); - ((PageHeader)oldPage)->pd_flags &= ~PD_WAL_LOGGED; pfree(tempPage); } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index b4bca7eed6f..574b125718f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1055,7 +1055,7 @@ DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) srels = palloc(sizeof(SMgrRelation) * ndelrels); for (i = 0; i < ndelrels; i++) { - 
SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId, 0); if (isRedo) { @@ -1333,7 +1333,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); + SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 0); File file; bool need_to_close; int result, diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 8d2b6b73b29..10a6f65c118 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -135,9 +135,14 @@ smgr(BackendId backend, RelFileNode rnode) * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the underlying file. + * + * The caller should pass the value of pg_class.relpersistence, if they know + * it, or 0 if unknown. Some operations, like smgrwrite() and smgrunlink() + * are allowed when relpersistence is not known, but others like smgrread() + * require it. */ SMgrRelation -smgropen(RelFileNode rnode, BackendId backend) +smgropen(RelFileNode rnode, BackendId backend, char relpersistence) { RelFileNodeBackend brnode; SMgrRelation reln; @@ -168,6 +173,7 @@ smgropen(RelFileNode rnode, BackendId backend) /* hash_search already filled in the lookup key */ reln->smgr_owner = NULL; reln->smgr_targblock = InvalidBlockNumber; + reln->smgr_relpersistence = relpersistence; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; @@ -179,6 +185,17 @@ smgropen(RelFileNode rnode, BackendId backend) /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); } + else + { + /* + * If the caller passed a valid 'relpersistence', and it was unknown + * before, update it. 
+ */ + if (reln->smgr_relpersistence == 0) + reln->smgr_relpersistence = relpersistence; + else + Assert(relpersistence == 0 || reln->smgr_relpersistence == relpersistence); + } return reln; } @@ -652,6 +669,30 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum) (*reln->smgr).smgr_immedsync(reln, forknum); } +/* + * Zenith-added functions to mark the phases of an unlogged index build. + */ +void +smgr_start_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_start_unlogged_build) + (*reln->smgr).smgr_start_unlogged_build(reln); +} + +void +smgr_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_finish_unlogged_build_phase_1) + (*reln->smgr).smgr_finish_unlogged_build_phase_1(reln); +} + +void +smgr_end_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_end_unlogged_build) + (*reln->smgr).smgr_end_unlogged_build(reln); +} + /* * AtEOXact_SMgr * diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index a02592fc0c2..9e1620922ce 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -74,6 +74,7 @@ #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" +#include "catalog/pg_class.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -530,7 +531,7 @@ BeginRedoForBlock(StringInfo input_message) MemoryContextSwitchTo(oldcxt); - reln = smgropen(rnode, InvalidBackendId); + reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || reln->smgr_cached_nblocks[forknum] < blknum + 1) { diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index ade36f28be5..33474e01941 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -271,12 +271,15 @@ pg_tablespace_size_name(PG_FUNCTION_ARGS) * is no check here or at the call sites for that. 
*/ static int64 -calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum) +calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum, char relpersistence) { - SMgrRelation srel = smgropen(*rfn, backend); - if (smgrexists(srel, forknum)) { + SMgrRelation srel = smgropen(*rfn, backend, relpersistence); + + if (smgrexists(srel, forknum)) + { BlockNumber n = smgrnblocks(srel, forknum); - return (int64)n*BLCKSZ; + + return (int64) n * BLCKSZ; } return 0; } @@ -302,7 +305,8 @@ pg_relation_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); size = calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkname_to_number(text_to_cstring(forkName))); + forkname_to_number(text_to_cstring(forkName)), + rel->rd_rel->relpersistence); relation_close(rel, AccessShareLock); @@ -327,7 +331,8 @@ calculate_toast_table_size(Oid toastrelid) /* toast heap size, including FSM and VM size */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastRel->rd_node), - toastRel->rd_backend, forkNum); + toastRel->rd_backend, forkNum, + toastRel->rd_rel->relpersistence); /* toast index size, including FSM and VM size */ indexlist = RelationGetIndexList(toastRel); @@ -341,7 +346,8 @@ calculate_toast_table_size(Oid toastrelid) AccessShareLock); for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastIdxRel->rd_node), - toastIdxRel->rd_backend, forkNum); + toastIdxRel->rd_backend, forkNum, + toastIdxRel->rd_rel->relpersistence); relation_close(toastIdxRel, AccessShareLock); } @@ -370,7 +376,8 @@ calculate_table_size(Relation rel) */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkNum); + forkNum, + rel->rd_rel->relpersistence); /* * Size of toast relation @@ -410,7 +417,8 @@ calculate_indexes_size(Relation rel) for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(idxRel->rd_node), 
idxRel->rd_backend, - forkNum); + forkNum, + idxRel->rd_rel->relpersistence); relation_close(idxRel, AccessShareLock); } diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 6d140786c74..40fcdf6d871 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -227,6 +227,8 @@ extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); +extern void MarkBufferPermanent(Buffer buffer); + extern void UnlockBuffers(void); extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 6704f69f328..c86ccdaf608 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -182,24 +182,7 @@ typedef PageHeaderData *PageHeader; #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ -/* Zenith XXX: - * Some operations in PostgreSQL are not WAL-logged at all (i.e. hint bits) - * or delay wal-logging till the end of operation (i.e. index build). - * - * So if such page is evicted, we will lose the update. - * To fix it, we introduce PD_WAL_LOGGED bit to track whether the page was wal-logged. - * If page is evicted before it has been wal-logged, then pagestore_smgr creates FPI for it. - * - * List of such operations: - * - GIN/GiST/SP-GiST index build - * - page and heaptuple hint bits - * - Clearing visibility map bits - * - FSM changes - * - ??? - */ -#define PD_WAL_LOGGED 0x0008 /* Page is wal-logged */ -#define PD_VALID_FLAG_BITS 0x000F /* OR of all valid pd_flags bits */ - +#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. 
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a7c98c7e7fe..c08eaed6179 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -43,6 +43,9 @@ typedef struct SMgrRelationData /* rnode is the hashtable lookup key, so it must be first! */ RelFileNodeBackend smgr_rnode; /* relation physical identifier */ + /* copy of pg_class.relpersistence, or 0 if not known */ + char smgr_relpersistence; + /* pointer to owning pointer, or NULL if none */ struct SMgrRelationData **smgr_owner; @@ -115,6 +118,10 @@ typedef struct f_smgr void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + + void (*smgr_start_unlogged_build) (SMgrRelation reln); + void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); + void (*smgr_end_unlogged_build) (SMgrRelation reln); } f_smgr; typedef void (*smgr_init_hook_type) (void); @@ -132,7 +139,7 @@ extern const f_smgr *smgr_standard(BackendId backend, RelFileNode rnode); extern const f_smgr *smgr(BackendId backend, RelFileNode rnode); extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); +extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend, char relpersistence); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); @@ -159,4 +166,8 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); +extern void smgr_start_unlogged_build(SMgrRelation reln); +extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); +extern void smgr_end_unlogged_build(SMgrRelation reln); + #endif /* SMGR_H */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 77d176a9348..df30a5729cf 100644 --- 
a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -534,7 +534,8 @@ typedef struct ViewOptions #define RelationOpenSmgr(relation) \ do { \ if ((relation)->rd_smgr == NULL) \ - smgrsetowner(&((relation)->rd_smgr), smgropen((relation)->rd_node, (relation)->rd_backend)); \ + smgrsetowner(&((relation)->rd_smgr), \ + smgropen((relation)->rd_node, (relation)->rd_backend, (relation)->rd_rel->relpersistence)); \ } while (0) /* From 7a6caaa055980a7740a71bd7c9f7b0188302ba24 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 14 Oct 2021 15:03:14 +0300 Subject: [PATCH 066/167] Fix queue cleanup in proposer (#93) Queue was moved further than truncateLsn, when quorumLsn matched end of wal record in the middle of queue message. Fix cleanup of unreceived messages. Co-authored-by: Arseny Sher --- src/backend/replication/walproposer.c | 39 +++++++++++++++++++-------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 35624a77352..526602e953b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -77,8 +77,8 @@ static ProposerGreeting proposerGreeting; static WaitEventSet *waitEvents; static AppendResponse lastFeedback; /* - * minimal LSN which may be needed for recovery of some safekeeper (end lsn - * + 1 of last chunk streamed to everyone) + * minimal LSN which may be needed for recovery of some safekeeper, + * record-aligned (first record which might not yet received by someone). 
*/ static XLogRecPtr truncateLsn; static XLogRecPtr candidateTruncateLsn; @@ -360,14 +360,10 @@ HandleWalKeeperResponse(void) EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } - - /* Cleanup message queue */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1)) + /* Advance truncateLsn */ + WalMessage *msgQueueAck = msgQueueHead; + while (msgQueueAck != NULL && msgQueueAck->ackMask == ((1 << n_walkeepers) - 1)) { - WalMessage *msg = msgQueueHead; - - msgQueueHead = msg->next; - /* * This piece is received by everyone; try to advance truncateLsn, but * hold it back to nearest commitLsn. Thus we will always start @@ -383,22 +379,39 @@ HandleWalKeeperResponse(void) * read from WAL and send are plain sheets of bytes, but safekeepers * ack only on commit boundaries. */ - if (msg->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) + if (msgQueueAck->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) { truncateLsn = minQuorumLsn; candidateTruncateLsn = InvalidXLogRecPtr; } - else if (msg->req.endLsn >= candidateTruncateLsn && + else if (msgQueueAck->req.endLsn >= candidateTruncateLsn && candidateTruncateLsn != InvalidXLogRecPtr) { truncateLsn = candidateTruncateLsn; candidateTruncateLsn = InvalidXLogRecPtr; } + + msgQueueAck = msgQueueAck->next; + } + + /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + { + WalMessage *msg = msgQueueHead; + msgQueueHead = msg->next; + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); free(msg); } if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; + /* truncateLsn always points to the first chunk in the queue */ + if (msgQueueHead) + { + /* Max takes care of special 0-sized messages */ + Assert(truncateLsn >= msgQueueHead->req.beginLsn && + truncateLsn < 
Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); + } /* * Generally sync is done when majority switched the epoch so we committed @@ -1370,6 +1383,10 @@ AdvancePollState(int i, uint32 events) if (++n_votes != quorum) { + /* Can't start streaming earlier than truncateLsn */ + wk->startStreamingAt = truncateLsn; + Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + /* * We are already streaming WAL: send all pending messages * to the attached walkeeper From f38fd05e239467a433ebe1308ee64dace48466f0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 19 Oct 2021 09:47:32 +0300 Subject: [PATCH 067/167] Support read-only nodes This changes the format of the 'zenith.signal' file. It is now a human-readable text file, with one line like "PREV LSN: 0/1234568", or "PREV LSN: none" if the prev LSN is not known, or "PREV LSN: invalid" if starting up in read-write is not allowed. Also, if 'zenith.signal' is present, don't try to read the checkpoint record from the WAL. Trust the copy in pg_control, instead. 
--- contrib/zenith/pagestore_smgr.c | 9 +- src/backend/access/transam/xlog.c | 193 ++++++++++++++++++++------ src/include/access/xlog.h | 2 + src/include/replication/walproposer.h | 2 +- 4 files changed, 156 insertions(+), 50 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 99914365428..cfb1068e122 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -560,13 +560,14 @@ zenith_get_request_lsn(bool *latest) if (RecoveryInProgress()) { + *latest = false; lsn = GetXLogReplayRecPtr(NULL); elog(DEBUG1, "zenith_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", (uint32) ((lsn) >> 32), (uint32) (lsn)); - lsn = InvalidXLogRecPtr; } else if (am_walsender) { + *latest = true; lsn = InvalidXLogRecPtr; elog(DEBUG1, "am walsender zenith_get_request_lsn lsn 0 "); } @@ -579,6 +580,7 @@ zenith_get_request_lsn(bool *latest) * pages modified by later WAL records must still in the buffer cache, * so our request cannot concern those. */ + *latest = true; lsn = GetLastWrittenPageLSN(); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", @@ -602,11 +604,6 @@ zenith_get_request_lsn(bool *latest) } } - /* - * FIXME: In read-only mode, we would need to set *latest=false here. But - * we don't support read-only mode at the moment - */ - *latest = true; return lsn; } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index beb114c16a5..7c9bdce8dcc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -280,6 +280,13 @@ bool InArchiveRecovery = false; static bool standby_signal_file_found = false; static bool recovery_signal_file_found = false; +/* + * Variables read from 'zenith.signal' file. + */ +bool ZenithRecoveryRequested = false; +XLogRecPtr zenithLastRec = InvalidXLogRecPtr; +bool zenithWriteOk = false; + /* Was the last xlog file restored from archive, or local? 
*/ static bool restoredFromArchive = false; @@ -5539,6 +5546,81 @@ readRecoverySignalFile(void) errmsg("standby mode is not supported by single-user servers"))); } +static void +readZenithSignalFile(void) +{ + int fd; + + fd = BasicOpenFile(ZENITH_SIGNAL_FILE, O_RDONLY | PG_BINARY); + if (fd >= 0) + { + struct stat statbuf; + char *content; + char prev_lsn_str[20]; + + /* Slurp the file into a string */ + if (stat(ZENITH_SIGNAL_FILE, &statbuf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content = palloc(statbuf.st_size + 1); + if (read(fd, content, statbuf.st_size) != statbuf.st_size) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + ZENITH_SIGNAL_FILE))); + content[statbuf.st_size] = '\0'; + + /* Parse it */ + if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + + if (strcmp(prev_lsn_str, "invalid") == 0) + { + /* No prev LSN. Forbid starting up in read-write mode */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = false; + } + else if (strcmp(prev_lsn_str, "none") == 0) + { + /* + * The page server had no valid prev LSN, but assured that it's ok + * to start without it. This happens when you start the compute + * node for the first time on a new branch. 
+ */ + zenithLastRec = InvalidXLogRecPtr; + zenithWriteOk = true; + } + else + { + uint32 hi, + lo; + + if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithLastRec = ((uint64) hi) << 32 | lo; + + /* If prev LSN is given, it better be valid */ + if (zenithLastRec == InvalidXLogRecPtr) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("invalid prev-LSN in file \"%s\"", ZENITH_SIGNAL_FILE))); + zenithWriteOk = true; + } + ZenithRecoveryRequested = true; + close(fd); + + elog(LOG, + "[ZENITH] found 'zenith.signal' file. setting prev LSN to %X/%X", + LSN_FORMAT_ARGS(zenithLastRec)); + } +} + static void validateRecoveryParameters(void) { @@ -6495,7 +6577,6 @@ StartupXLOG(void) bool reachedRecoveryTarget = false; bool haveBackupLabel = false; bool haveTblspcMap = false; - bool skipLastRecordReread = false; XLogRecPtr RecPtr, checkPointLoc, EndOfLog; @@ -6520,10 +6601,15 @@ StartupXLOG(void) CurrentResourceOwner == AuxProcessResourceOwner); CurrentResourceOwner = AuxProcessResourceOwner; + /* + * Read zenith.signal before anything else. + */ + readZenithSignalFile(); + /* * Check that contents look valid. 
*/ - if (!XRecOffIsValid(ControlFile->checkPoint)) + if (!XRecOffIsValid(ControlFile->checkPoint) && !ZenithRecoveryRequested) ereport(FATAL, (errmsg("control file contains invalid checkpoint location"))); @@ -6653,6 +6739,9 @@ StartupXLOG(void) else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); + else if (ZenithRecoveryRequested) + ereport(LOG, + (errmsg("starting zenith recovery"))); else ereport(LOG, (errmsg("starting archive recovery"))); @@ -6783,6 +6872,29 @@ StartupXLOG(void) /* set flag to delete it later */ haveBackupLabel = true; } + else if (ZenithRecoveryRequested) + { + /* + * Zenith hacks to spawn compute node without WAL. Pretend that we + * just finished reading the record that started at 'zenithLastRec' + * and ended at checkpoint.redo + */ + elog(LOG, "starting with zenith basebackup at LSN %X/%X, prev %X/%X", + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + LSN_FORMAT_ARGS(zenithLastRec)); + + checkPointLoc = zenithLastRec; + RedoStartLSN = ControlFile->checkPointCopy.redo; + EndRecPtr = ControlFile->checkPointCopy.redo; + + memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); + wasShutdown = true; + + /* Initialize expectedTLEs, like ReadRecord() does */ + expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID); + + XLogBeginRead(xlogreader, EndRecPtr); + } else { /* @@ -7040,30 +7152,10 @@ StartupXLOG(void) RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; doPageWrites = lastFullPageWrites; - if (RecPtr < checkPoint.redo) - { - int fd = BasicOpenFile("zenith.signal", O_RDWR | PG_BINARY); - if (fd >= 0) { - XLogRecPtr prevRecPtr = 0; - if ((size_t)read(fd, &prevRecPtr, sizeof prevRecPtr) != sizeof(prevRecPtr)) { - elog(LOG, "can't read previous record position from zenith.signal file: %m"); - } - LastRec = prevRecPtr; - /* Zenith hacks to spawn compute node without WAL */ - EndRecPtr = RecPtr = 
checkPoint.redo; - skipLastRecordReread = true; - close(fd); - elog(LOG, - "[ZENITH] found 'zenith.signal' file. Setting prevRecPtr to %X/%X", - LSN_FORMAT_ARGS(prevRecPtr)); - } - else - { - ereport(PANIC, - (errmsg("invalid redo in checkpoint record"))); - } - } + if (RecPtr < checkPoint.redo && !ZenithRecoveryRequested) + ereport(PANIC, + (errmsg("invalid redo in checkpoint record"))); /* * Check whether we need to force recovery from WAL. If it appears to @@ -7728,25 +7820,40 @@ StartupXLOG(void) * that and continue after it. In all other cases, re-fetch the last * valid or last applied record, so we can identify the exact endpoint of * what we consider the valid portion of WAL. + * + * When starting from a zenith base backup, we don't have WAL. Initialize + * the WAL page where we will start writing new records from scratch, + * instead. */ - - /* - * We use the last WAL page to initialize the WAL for writing, - * so we better have it in memory. - */ - if (skipLastRecordReread) + if (ZenithRecoveryRequested) { - int offs = (EndRecPtr % XLOG_BLCKSZ); - XLogRecPtr lastPage = EndRecPtr - offs; - int idx = XLogRecPtrToBufIdx(lastPage); - XLogPageHeader xlogPageHdr = (XLogPageHeader)(XLogCtl->pages + idx*XLOG_BLCKSZ); - xlogPageHdr->xlp_pageaddr = lastPage; - xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; - xlogPageHdr->xlp_tli = ThisTimeLineID; - xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; - xlogPageHdr->xlp_rem_len = offs - SizeOfXLogShortPHD; - readOff = XLogSegmentOffset(lastPage, wal_segment_size); - elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); + if (!zenithWriteOk) + { + /* + * We cannot start generating new WAL if we don't have a valid prev-LSN + * to use for the first new WAL record. (Shouldn't happen.) 
+ */ + ereport(ERROR, + (errmsg("cannot start in read-write mode from this base backup"))); + } + else + { + int offs = (EndRecPtr % XLOG_BLCKSZ); + XLogRecPtr lastPage = EndRecPtr - offs; + int idx = XLogRecPtrToBufIdx(lastPage); + XLogPageHeader xlogPageHdr = (XLogPageHeader) (XLogCtl->pages + idx * XLOG_BLCKSZ); + + xlogPageHdr->xlp_pageaddr = lastPage; + xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC; + xlogPageHdr->xlp_tli = ThisTimeLineID; + xlogPageHdr->xlp_info = XLP_FIRST_IS_CONTRECORD; // FIXME + xlogPageHdr->xlp_rem_len = offs - SizeOfXLogShortPHD; + readOff = XLogSegmentOffset(lastPage, wal_segment_size); + + elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(EndRecPtr)); + + // FIXME: should we unlink zenith.signal? + } } else { @@ -7941,7 +8048,7 @@ StartupXLOG(void) /* Copy the valid part of the last block, and zero the rest */ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; len = EndOfLog % XLOG_BLCKSZ; - if (!skipLastRecordReread) + if (!ZenithRecoveryRequested) memcpy(page, xlogreader->readBuf, len); memset(page + len, 0, XLOG_BLCKSZ - len); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 8b8b14d2fd0..986eb957570 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -403,6 +403,8 @@ extern SessionBackupState get_backup_status(void); #define TABLESPACE_MAP "tablespace_map" #define TABLESPACE_MAP_OLD "tablespace_map.old" +#define ZENITH_SIGNAL_FILE "zenith.signal" + /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 222faaea41d..c6ece7a8ec7 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -263,7 +263,7 @@ struct WalMessage /* PHANTOM FIELD: * - * All WalMessages are allocated with exactly (size - sizeof(WalKeeperRequest)) additional bytes + * All WalMessages are allocated with exactly (size - sizeof(AppendRequestHeader)) 
additional bytes * after them, containing the body of the message. This allocation is done in `CreateMessage` * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). */ }; From 3642bebcd830bbfd34ba8500c6eed8135f0a6b35 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 21 Oct 2021 16:28:52 +0300 Subject: [PATCH 068/167] Remove a drop of syncSafekeepers complexity. --- src/backend/replication/walproposer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 526602e953b..7cf0414a4cc 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1426,8 +1426,8 @@ AdvancePollState(int i, uint32 events) if (syncSafekeepers) { - /* Wait until all walkeepers are synced */ - WalProposerLoop(); + /* keep polling until all walkeepers are synced */ + return; } } else if (syncSafekeepers) From 4744ed7478b962ad441bfdd78674d3cc54167d52 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 21 Oct 2021 16:36:28 +0300 Subject: [PATCH 069/167] Fix compiler warning. 
warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement] 364 | WalMessage *msgQueueAck = msgQueueHead; --- src/backend/replication/walproposer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7cf0414a4cc..5bb6322d29d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -339,6 +339,7 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; + WalMessage *msgQueueAck; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastFeedback.flushLsn) @@ -361,7 +362,7 @@ HandleWalKeeperResponse(void) } /* Advance truncateLsn */ - WalMessage *msgQueueAck = msgQueueHead; + msgQueueAck = msgQueueHead; while (msgQueueAck != NULL && msgQueueAck->ackMask == ((1 << n_walkeepers) - 1)) { /* From b9e8a5ce522390a2f1e58b9f1304e389ea0ae4ab Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 18 Sep 2021 17:45:56 +0300 Subject: [PATCH 070/167] Implement backpressure for compute node to avoid WAL overflow --- contrib/zenith/pagestore_smgr.c | 5 ++- src/backend/access/transam/xloginsert.c | 43 +++++++++++++++++++++++ src/backend/replication/walproposer.c | 45 +++++++++++++++++++++++-- src/backend/replication/walsender.c | 29 ++++++++++++++++ src/backend/storage/buffer/bufmgr.c | 1 - src/backend/utils/activity/wait_event.c | 3 ++ src/backend/utils/misc/guc.c | 24 +++++++++++++ src/include/access/xlogdefs.h | 8 +++++ src/include/access/xloginsert.h | 3 ++ src/include/replication/walproposer.h | 2 ++ src/include/replication/walsender.h | 3 +- src/include/utils/wait_event.h | 3 +- 12 files changed, 160 insertions(+), 9 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index cfb1068e122..25ad896491b 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -51,16 +51,16 @@ #include 
"access/xlog_internal.h" #include "catalog/pg_class.h" #include "pagestore_client.h" -#include "storage/relfilenode.h" +#include "pagestore_client.h" #include "storage/smgr.h" #include "access/xlogdefs.h" #include "postmaster/interrupt.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/md.h" #include "fmgr.h" #include "miscadmin.h" #include "pgstat.h" -#include "replication/walsender.h" #include "catalog/pg_tablespace_d.h" /* @@ -781,7 +781,6 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } zenith_wallog_page(reln, forkNum, blkno, buffer); - set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); lsn = PageGetLSN(buffer); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index b153fad594d..22ba12fceec 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -29,9 +29,11 @@ #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/wait_event.h" /* Buffer size required to store a compressed version of backup block image */ #define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) @@ -61,6 +63,10 @@ typedef struct char compressed_page[PGLZ_MAX_BLCKSZ]; } registered_buffer; +/* GUCs */ +int max_replication_write_lag; +int max_replication_flush_lag; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -115,6 +121,9 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); +/* Timeout in milliseconds for delaying backend WAL inserts to avoid WAL overflow */ +#define BACK_PRESSURE_TIMEOUT 100 +#define MB ((XLogRecPtr)1024*1024) 
/* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). @@ -133,6 +142,40 @@ XLogBeginInsert(void) if (begininsert_called) elog(ERROR, "XLogBeginInsert was already called"); + if (max_replication_write_lag != 0 || max_replication_flush_lag != 0) + { + uint64 slept = 0; + + /* Suspend writes until replicas catch up */ + while (true) + { + XLogRecPtr replicaWriteLsn; + XLogRecPtr replicaFlushLsn; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn); + + if ((replicaWriteLsn != UnknownXLogRecPtr + && myFlushLsn > replicaWriteLsn + max_replication_write_lag*MB) || + (replicaFlushLsn != UnknownXLogRecPtr + && myFlushLsn > replicaFlushLsn + max_replication_flush_lag*MB)) + { + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + BACK_PRESSURE_TIMEOUT, + WAIT_EVENT_BACK_PRESSURE); + ResetLatch(MyLatch); + slept += BACK_PRESSURE_TIMEOUT; + } + else + break; + } + + // XXX: INFO will cause a lot of regression tests to fail. 
+ if (slept > 0) + elog(DEBUG1, "slept for " UINT64_FORMAT " ms while waiting for all replicas to catch up", slept); + } + begininsert_called = true; } diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5bb6322d29d..53c7b06931a 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -133,6 +133,23 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) } } +/* + * Get minimum of disk consistent LSNs of all safekeepers + */ +static XLogRecPtr +CalculateDiskConsistentLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.diskConsistentLsn < lsn) + { + lsn = walkeeper[i].feedback.diskConsistentLsn; + } + } + return lsn; +} + /* Initializes the internal event set, provided that it is currently null */ static void InitEventSet(void) @@ -339,16 +356,27 @@ HandleWalKeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; WalMessage *msgQueueAck; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastFeedback.flushLsn) + diskConsistentLsn = CalculateDiskConsistentLsn(); + + if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) { - lastFeedback.flushLsn = minQuorumLsn; + + if (minQuorumLsn > lastFeedback.flushLsn) + lastFeedback.flushLsn = minQuorumLsn; + + lastFeedback.diskConsistentLsn = diskConsistentLsn; + /* advance the replication slot */ if (!syncSafekeepers) - ProcessStandbyReply(minQuorumLsn, minQuorumLsn, InvalidXLogRecPtr, GetCurrentTimestamp(), false); + ProcessStandbyReply(lastFeedback.diskConsistentLsn, + lastFeedback.flushLsn, + InvalidXLogRecPtr, GetCurrentTimestamp(), false); } + CombineHotStanbyFeedbacks(&hsFeedback); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) { @@ -1056,6 +1084,17 @@ WalProposerPoll(void) ResetLatch(MyLatch); break; } + if 
(rc == 0) /* timeout expired: poll state */ + { + /* + * If no WAL was generated during timeout (and we have already + * collected the quorum), then send pool message + */ + if (lastSentLsn != InvalidXLogRecPtr) + { + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + } + } } } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index c17dd98b3da..028406c4880 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3750,3 +3750,32 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) Assert(time != 0); return now - time; } + +/* + * Get minimal write and flush LSN among all live replicas + */ +void +GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn) +{ + XLogRecPtr min_write_lsn = UnknownXLogRecPtr; + XLogRecPtr min_flush_lsn = UnknownXLogRecPtr; + for (int i = 0; i < max_wal_senders; i++) + { + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + if (walsnd->state == WALSNDSTATE_STREAMING) + { + /* + * We assume that reads from walsnd->write/flush are atomic + * on all modern x64 systems, as these fields are uint64 and + * should be 8-bytes aligned. + */ + XLogRecPtr written = walsnd->write; + XLogRecPtr flushed = walsnd->flush; + min_write_lsn = Min(written, min_write_lsn); + min_flush_lsn = Min(flushed, min_flush_lsn); + } + } + *write_lsn = min_write_lsn; + *flush_lsn = min_flush_lsn; +} + diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 2ec50f6d66e..c3b054d1f29 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -55,7 +55,6 @@ #include "utils/resowner_private.h" #include "utils/timestamp.h" - /* Note: these two macros only work on shared buffers, not local ones! 
*/ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 6baf67740c7..4fcf772bdf0 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -485,6 +485,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_DELAY: event_name = "VacuumDelay"; break; + case WAIT_EVENT_BACK_PRESSURE: + event_name = "BackPressure"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 71b43a51ef4..5f02be6e689 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2899,6 +2899,30 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exeeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + 1024, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal flush lag between master and replicas."), + gettext_noop("When lag between minimal flush position of replica and current LSN exeeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_flush_lag, + 1, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index 
0940b64ca6b..d44d5e64cdc 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -28,6 +28,14 @@ typedef uint64 XLogRecPtr; #define InvalidXLogRecPtr 0 #define XLogRecPtrIsInvalid(r) ((r) == InvalidXLogRecPtr) +/* + * Maximum possible XLogRecPtr value. Currently used by back pressure + * mechanism to distinguish the unknown replica flush/write position. + * This significantly simplifies comparison and checks as we always + * look for the minimal value. + */ +#define UnknownXLogRecPtr ((XLogRecPtr) ~0) + /* * First LSN to use for "fake" LSNs. * diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index f1d8c39edf1..699ca56ed25 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,6 +38,9 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +extern int max_replication_write_lag; +extern int max_replication_flush_lag; + /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index c6ece7a8ec7..2e32e0f0f7c 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -294,6 +294,8 @@ typedef struct AppendResponse // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit XLogRecPtr commitLsn; + // Part of WALL applied and written to the disk by all pageservers + XLogRecPtr diskConsistentLsn; HotStandbyFeedback hs; } AppendResponse; diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 828106933ca..bd2f9ad6d28 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -12,6 +12,7 @@ #ifndef _WALSENDER_H #define _WALSENDER_H +#include "access/xlog.h" #include /* @@ -47,7 +48,7 @@ extern void 
WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); - +extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush); /* * Remember that we want to wakeup walsenders later * diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 6c6ec2e7118..0f87a557da7 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -140,7 +140,8 @@ typedef enum WAIT_EVENT_PG_SLEEP, WAIT_EVENT_RECOVERY_APPLY_DELAY, WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL, - WAIT_EVENT_VACUUM_DELAY + WAIT_EVENT_VACUUM_DELAY, + WAIT_EVENT_BACK_PRESSURE } WaitEventTimeout; /* ---------- From c7ce764436dab0ead709b3bdf7737fc80339e4d1 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 19 Oct 2021 19:12:38 +0300 Subject: [PATCH 071/167] Initialize FSM/VM pages through buffer cache To prevent loading them from pageserver. Author: Konstantin Knizhnik with my extension to VM as well. --- src/backend/access/heap/visibilitymap.c | 15 ++++++++++++--- src/backend/storage/freespace/freespace.c | 14 +++++++++++--- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index e198df65d82..addfe93eac8 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) /* Now extend the file */ while (vm_nblocks_now < vm_nblocks) { - PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); + /* + * ZENITH: Initialize VM pages through buffer cache to prevent loading + * them from pageserver. 
+ */ + Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, + RBM_ZERO_AND_LOCK, NULL); + Page page = BufferGetPage(buffer); + + PageInit((Page) page, BLCKSZ, 0); + PageSetChecksumInplace(page, vm_nblocks_now); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); - smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, - pg.data, false); vm_nblocks_now++; } diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 8c12dda2380..abd5f3de0cf 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -637,10 +637,18 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) while (fsm_nblocks_now < fsm_nblocks) { - PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now); + /* + * ZENITH: Initialize FSM pages through buffer cache to prevent loading + * them from pageserver. + */ + Buffer buffer = ReadBufferExtended(rel, FSM_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL); + Page page = BufferGetPage(buffer); + + PageInit((Page) page, BLCKSZ, 0); + PageSetChecksumInplace(page, fsm_nblocks_now); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); - smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now, - pg.data, false); fsm_nblocks_now++; } From 6e5f8d64ffacd3c5110ab068757ac4a19b46402d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 21 Oct 2021 23:16:00 +0300 Subject: [PATCH 072/167] Turn off back pressure by default --- src/backend/access/transam/xloginsert.c | 3 ++- src/backend/utils/misc/guc.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 22ba12fceec..a8263c2496a 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -121,9 +121,10 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 
hole_length, char *dest, uint16 *dlen); -/* Timeout in milliseconds for delaying backend WAL inserts to avoid WAL overflow */ +/* Timeout in milliseconds for delaying WAL inserts to avoid WAL overflow */ #define BACK_PRESSURE_TIMEOUT 100 #define MB ((XLogRecPtr)1024*1024) + /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5f02be6e689..a12f0b88bbb 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2902,24 +2902,24 @@ static struct config_int ConfigureNamesInt[] = { {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, gettext_noop("Maximal write lag between master and replicas."), - gettext_noop("When lag between minimal write position of replica and current LSN exeeds this value," + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," "backends are blocked"), GUC_UNIT_MB, }, &max_replication_write_lag, - 1024, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, { {"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING, gettext_noop("Maximal flush lag between master and replicas."), - gettext_noop("When lag between minimal flush position of replica and current LSN exeeds this value," + gettext_noop("When lag between minimal flush position of replica and current LSN exceeds this value," "backends are blocked"), GUC_UNIT_MB, }, &max_replication_flush_lag, - 1, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, From d6cb3fa8b5d01638e64367987fdf27fe126fd198 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 21 Oct 2021 15:59:55 +0300 Subject: [PATCH 073/167] ShutdownConnection instead 
of ResetConnection in more places. At least currently risk of busy loop (e.g due to bugs) is much higher than benefit of additional availability if we immediately reconnect; add interval between the reconnection attempts. --- src/backend/replication/walproposer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 53c7b06931a..667d18ba56e 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1213,7 +1213,7 @@ AdvancePollState(int i, uint32 events) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return; } @@ -1252,7 +1252,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return; /* @@ -1263,7 +1263,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", wk->host, wk->port); - ResetConnection(i); + ShutdownConnection(i); return; } break; @@ -1662,7 +1662,7 @@ AsyncRead(int i, void *value, size_t value_size) wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return false; } @@ -1707,7 +1707,7 @@ BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) elog(WARNING, "Failed to send to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return false; } @@ -1756,7 +1756,7 @@ AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKee elog(WARNING, "Failed to send to node %s:%s in %s 
state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ShutdownConnection(i); return false; } From bcbf12ca78ce2ac41a6e5766bc32d8b66f0a637c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 3 Nov 2021 19:55:40 +0200 Subject: [PATCH 074/167] Handle partial writes to stdout in WAL redo process. --- src/backend/tcop/zenith_wal_redo.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 9e1620922ce..0ddd2ddec24 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -660,6 +660,7 @@ GetPage(StringInfo input_message) BlockNumber blknum; Buffer buf; Page page; + int tot_written; /* * message format: @@ -683,7 +684,21 @@ GetPage(StringInfo input_message) /* single thread, so don't bother locking the page */ /* Response: Page content */ - write(STDOUT_FILENO, page, BLCKSZ); /* FIXME: check errors */ + tot_written = 0; + do { + ssize_t rc; + + rc = write(STDOUT_FILENO, &page[tot_written], BLCKSZ - tot_written); + if (rc < 0) { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + continue; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to stdout: %m"))); + } + tot_written += rc; + } while (tot_written < BLCKSZ); ReleaseBuffer(buf); DropDatabaseBuffers(rnode.dbNode); From a3e992fb518f27cd05ecca7aa2df42787f8baebd Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 8 Nov 2021 15:59:23 +0300 Subject: [PATCH 075/167] Handle keepalives while receiving WAL in recovery. Since c310932 safekeeper sometimes sends it. 
ref #843 --- src/backend/replication/walproposer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 667d18ba56e..618c8992f57 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -978,7 +978,9 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } else { - Assert(buf[0] == 'w'); + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], sizeof rec_start_lsn); rec_start_lsn = pg_ntoh64(rec_start_lsn); From 1bb6aa910d6de92da6a2b44b3eda6f85a6381dff Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 9 Nov 2021 23:45:50 +0300 Subject: [PATCH 076/167] Fix truncateLsn update (#101) truncateLsn is now advanced to `Min(walkeeper[i].feedback.flushLsn)` with taking epochs into account. --- src/backend/replication/walproposer.c | 85 ++++++++++++--------------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 618c8992f57..d99f9025b0c 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -81,7 +81,6 @@ static AppendResponse lastFeedback; * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; -static XLogRecPtr candidateTruncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ @@ -150,6 +149,26 @@ CalculateDiskConsistentLsn(void) return lsn; } +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. 
+ */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + /* We can't rely on safekeeper flushLsn if it has wrong epoch */ + if (walkeeper[i].feedback.epoch != propTerm) + return 0; + + if (walkeeper[i].feedback.flushLsn < lsn) + lsn = walkeeper[i].feedback.flushLsn; + } + return lsn; +} + /* Initializes the internal event set, provided that it is currently null */ static void InitEventSet(void) @@ -357,7 +376,7 @@ HandleWalKeeperResponse(void) HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; XLogRecPtr diskConsistentLsn; - WalMessage *msgQueueAck; + XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = CalculateDiskConsistentLsn(); @@ -389,39 +408,24 @@ HandleWalKeeperResponse(void) EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } - /* Advance truncateLsn */ - msgQueueAck = msgQueueHead; - while (msgQueueAck != NULL && msgQueueAck->ackMask == ((1 << n_walkeepers) - 1)) - { - /* - * This piece is received by everyone; try to advance truncateLsn, but - * hold it back to nearest commitLsn. Thus we will always start - * streaming from the beginning of the record, which simplifies - * decoding on the far end. - * - * This also prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on commit boundaries. 
- */ - if (msgQueueAck->req.endLsn >= minQuorumLsn && minQuorumLsn != InvalidXLogRecPtr) - { - truncateLsn = minQuorumLsn; - candidateTruncateLsn = InvalidXLogRecPtr; - } - else if (msgQueueAck->req.endLsn >= candidateTruncateLsn && - candidateTruncateLsn != InvalidXLogRecPtr) - { - truncateLsn = candidateTruncateLsn; - candidateTruncateLsn = InvalidXLogRecPtr; - } - - msgQueueAck = msgQueueAck->next; - } + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + truncateLsn = minFlushLsn; /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) @@ -1604,17 +1608,6 @@ AdvancePollState(int i, uint32 events) if (minQuorumLsn > lastSentCommitLsn) { BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - - /* - * commitLsn is always the record boundary; remember - * it so we can advance truncateLsn there. But do so - * only if previous value is applied, otherwise it - * might never catch up. 
- */ - if (candidateTruncateLsn == InvalidXLogRecPtr) - { - candidateTruncateLsn = minQuorumLsn; - } lastSentCommitLsn = minQuorumLsn; } break; From b2e8da8bda8c13fa9c517e9aec766b0418771fcc Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 10 Nov 2021 18:48:01 +0300 Subject: [PATCH 077/167] [walproposer] Get rid of SAB_Error after rebase Also see 1632ea43 for details. --- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d99f9025b0c..5512df16664 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -870,7 +870,7 @@ DetermineEpochStartLsn(void) */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { - (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, SAB_Error); + (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); From d4e813b23f630ee69180b2a79496be44ad5357d4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 5 Nov 2021 13:49:50 +0300 Subject: [PATCH 078/167] Add term history to safekeepers. See corresponding zenith commit. 
--- src/backend/replication/walproposer.c | 531 +++++++++++++------- src/backend/replication/walproposer_utils.c | 51 ++ src/include/replication/walproposer.h | 57 ++- 3 files changed, 456 insertions(+), 183 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5512df16664..3defed7f9ab 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -43,6 +43,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "access/xlog.h" +#include "libpq/pqformat.h" #include "replication/slot.h" #include "replication/walreceiver.h" #include "postmaster/bgworker.h" @@ -82,6 +83,7 @@ static AppendResponse lastFeedback; */ static XLogRecPtr truncateLsn; static VoteRequest voteRequest; /* Vote request for walkeeper */ +static TermHistory propTermHistory; /* term history of the proposer */ static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ @@ -95,13 +97,19 @@ static bool syncSafekeepers; /* Declarations of a few functions ahead of time, so that we can define them out of order. 
*/ static void AdvancePollState(int i, uint32 events); -static bool AsyncRead(int i, void *value, size_t value_size); +static bool AsyncRead(int i, char **buf, int *buf_size); +static bool AsyncReadFixed(int i, void *value, size_t value_size); +static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); -static bool AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state); -static bool AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state); -static void HackyRemoveWalProposerEvent(int to_remove); -static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); +static bool AsyncFlush(int i, bool socket_read_ready); +static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); static void BroadcastMessage(WalMessage *msg); +static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static term_t GetHighestTerm(TermHistory *th); +static term_t GetEpoch(WalKeeper *wk); +static void SendProposerElected(WalKeeper *wk); +static void StartStreaming(WalKeeper *wk); /* @@ -159,10 +167,6 @@ CalculateMinFlushLsn(void) XLogRecPtr lsn = UnknownXLogRecPtr; for (int i = 0; i < n_walkeepers; i++) { - /* We can't rely on safekeeper flushLsn if it has wrong epoch */ - if (walkeeper[i].feedback.epoch != propTerm) - return 0; - if (walkeeper[i].feedback.flushLsn < lsn) lsn = walkeeper[i].feedback.flushLsn; } @@ -205,7 +209,7 @@ UpdateEventSet(WalKeeper *wk, uint32 events) * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. 
*/ static void -HackyRemoveWalProposerEvent(int to_remove) +HackyRemoveWalProposerEvent(WalKeeper *to_remove) { /* Remove the existing event set */ if (waitEvents) @@ -228,7 +232,7 @@ HackyRemoveWalProposerEvent(int to_remove) wk->eventPos = -1; - if (i == to_remove) + if (wk == to_remove) continue; /* If this WAL keeper isn't offline, add an event for it! */ @@ -241,15 +245,18 @@ HackyRemoveWalProposerEvent(int to_remove) /* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ static void -ShutdownConnection(int i) +ShutdownConnection(WalKeeper *wk) { - if (walkeeper[i].conn) - walprop_finish(walkeeper[i].conn); - walkeeper[i].conn = NULL; - walkeeper[i].state = SS_OFFLINE; - walkeeper[i].currMsg = NULL; - - HackyRemoveWalProposerEvent(i); + if (wk->conn) + walprop_finish(wk->conn); + wk->conn = NULL; + wk->state = SS_OFFLINE; + wk->currMsg = NULL; + if (wk->voteResponse.termHistory.entries) + pfree(wk->voteResponse.termHistory.entries); + wk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(wk); } /* @@ -259,14 +266,13 @@ ShutdownConnection(int i) * On success, sets the state to SS_CONNECTING_WRITE. */ static void -ResetConnection(int i) +ResetConnection(WalKeeper *wk) { pgsocket sock; /* socket of the new connection */ - WalKeeper *wk = &walkeeper[i]; if (wk->state != SS_OFFLINE) { - ShutdownConnection(i); + ShutdownConnection(wk); } /* @@ -354,13 +360,11 @@ GetAcknowledgedByQuorumWALPosition(void) for (int i = 0; i < n_walkeepers; i++) { /* - * Note that while we haven't pushed WAL up to epoch start lsn to the - * majority we don't really know which LSN is reliably committed as - * reported flush_lsn is physical end of wal, which can contain - * diverged history (compared to donor). + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = walkeeper[i].feedback.epoch == propTerm - ? 
walkeeper[i].feedback.flushLsn : 0; + responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? + walkeeper[i].feedback.flushLsn : 0; } qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); @@ -529,6 +533,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) * `ResetConnection` as needed */ walkeeper[n_walkeepers].conninfo[0] = '\0'; + initStringInfo(&walkeeper[n_walkeepers].outbuf); walkeeper[n_walkeepers].currMsg = NULL; walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; n_walkeepers += 1; @@ -575,7 +580,7 @@ WalProposerStart(void) /* Initiate connections to all walkeeper nodes */ for (int i = 0; i < n_walkeepers; i++) { - ResetConnection(i); + ResetConnection(&walkeeper[i]); } WalProposerLoop(); @@ -831,17 +836,33 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) return msg; } +/* latest term in TermHistory, or 0 is there is no entries */ +static term_t +GetHighestTerm(TermHistory *th) +{ + return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; +} + +/* safekeeper's epoch is the term of the highest entry in the log */ +static term_t +GetEpoch(WalKeeper *wk) +{ + return GetHighestTerm(&wk->voteResponse.termHistory); +} /* * Called after majority of acceptors gave votes, it calculates the most * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since * which we'll write WAL in our term. - * Sets truncateLsn along the way (though it - * is not of much use at this point). + * + * Sets truncateLsn along the way (though it is not of much use at this point -- + * only for skipping recovery). 
*/ static void DetermineEpochStartLsn(void) { + TermHistory *dth; + propEpochStartLsn = InvalidXLogRecPtr; donorEpoch = 0; truncateLsn = InvalidXLogRecPtr; @@ -850,11 +871,11 @@ DetermineEpochStartLsn(void) { if (walkeeper[i].state == SS_IDLE) { - if (walkeeper[i].voteResponse.epoch > donorEpoch || - (walkeeper[i].voteResponse.epoch == donorEpoch && + if (GetEpoch(&walkeeper[i]) > donorEpoch || + (GetEpoch(&walkeeper[i]) == donorEpoch && walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) { - donorEpoch = walkeeper[i].voteResponse.epoch; + donorEpoch = GetEpoch(&walkeeper[i]); propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; donor = i; } @@ -884,6 +905,16 @@ DetermineEpochStartLsn(void) Assert((truncateLsn != InvalidXLogRecPtr) || (syncSafekeepers && truncateLsn == propEpochStartLsn)); + /* + * Proposer's term history is the donor's + its own entry. + */ + dth = &walkeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", quorum, propTerm, @@ -926,7 +957,7 @@ ReconnectWalKeepers(void) for (int i = 0; i < n_walkeepers; i++) { if (walkeeper[i].state == SS_OFFLINE) - ResetConnection(i); + ResetConnection(&walkeeper[i]); } } } @@ -1008,44 +1039,129 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec return false; } - /* - * Start sending entries to everyone from the beginning (truncateLsn), - * except for those who lives in donor's epoch and thus for sure has - * correct WAL. 
We could do here even slightly better, taking into account - * commitLsn of the rest to avoid sending them excessive data. + return true; +} + +/* + * Determine for wk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. + * + * Sets wk->startStreamingAt. + */ +static void +SendProposerElected(WalKeeper *wk) +{ + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; + + /* + * Determine start LSN by comparing safekeeper's log term switch history and + * proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning then. */ - for (int i = 0; i < n_walkeepers; i++) + th = &wk->voteResponse.termHistory; + /* + * If any WAL is present on the sk, it must be authorized by some term. + * OTOH, without any WAL there are no term swiches in the log. + */ + Assert((th->n_entries == 0) == + (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); + /* We must start somewhere. 
*/ + Assert(propTermHistory.n_entries >= 1); + + for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) { - if (walkeeper[i].state != SS_IDLE) - continue; + if (propTermHistory.entries[i].term != th->entries[i].term) + break; + /* term must begin everywhere at the same point */ + Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + } + i--; /* step back to the last common term */ + if (i < 0) + { + /* safekeeper is empty or no common point, start from the beginning */ + wk->startStreamingAt = propTermHistory.entries[0].lsn; + } + else + { + /* + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. + */ + if (propTermHistory.entries[i].term == propTerm) + { + wk->startStreamingAt = wk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : + wk->voteResponse.flushLsn); + wk->startStreamingAt = Min(propEndLsn, skEndLsn); + } + } + + Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = wk->startStreamingAt; + msg.termHistory = &propTermHistory; + + lastCommonTerm = i >= 0 ? 
propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); + + resetStringInfo(&wk->outbuf); + pq_sendint64_le(&wk->outbuf, msg.tag); + pq_sendint64_le(&wk->outbuf, msg.term); + pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); + } + + if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; + + StartStreaming(wk); +} - if (walkeeper[i].voteResponse.epoch != donorEpoch) +/* + * Start streaming to safekeeper wk. + */ +static void +StartStreaming(WalKeeper *wk) +{ + int wki = wk - walkeeper; + + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) + { + if (msg->req.endLsn <= wk->startStreamingAt) { - SendMessageToNode(i, msgQueueHead); + /* message is already received by this walkeeper */ + msg->ackMask |= 1 << wki; } else { - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= walkeeper[i].voteResponse.flushLsn) - { - /* message is already received by this walkeeper */ - msg->ackMask |= 1 << i; - } - else - { - /* - * By convention we always stream since the beginning of - * the record, and flushLsn points to it. 
- */ - walkeeper[i].startStreamingAt = walkeeper[i].voteResponse.flushLsn; - SendMessageToNode(i, msg); - break; - } - } + SendMessageToNode(wki, msg); + return; } } - return true; + wk->state = SS_IDLE; /* nothing to send yet, safekeeper is recovered */ } /* @@ -1196,7 +1312,7 @@ AdvancePollState(int i, uint32 events) * restart at a slower interval on calls to * ReconnectWalKeepers. */ - ShutdownConnection(i); + ShutdownConnection(wk); return; } @@ -1205,7 +1321,7 @@ AdvancePollState(int i, uint32 events) * un-register the old event and re-register an event on * the new socket. */ - HackyRemoveWalProposerEvent(i); + HackyRemoveWalProposerEvent(wk); wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); break; } @@ -1219,7 +1335,7 @@ AdvancePollState(int i, uint32 events) { elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return; } @@ -1258,7 +1374,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_FAILED: elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return; /* @@ -1269,7 +1385,7 @@ AdvancePollState(int i, uint32 events) case WP_EXEC_UNEXPECTED_SUCCESS: elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", wk->host, wk->port); - ShutdownConnection(i); + ShutdownConnection(wk); return; } break; @@ -1301,7 +1417,7 @@ AdvancePollState(int i, uint32 events) * error handling or state setting is taken care of. We can * leave any other work until later. */ - if (!AsyncRead(i, &wk->greet, sizeof(wk->greet))) + if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) return; /* Protocol is all good, move to voting. 
*/ @@ -1381,11 +1497,12 @@ AdvancePollState(int i, uint32 events) case SS_VOTING: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(i); + ResetConnection(wk); break; /* We have quorum for voting, send our vote request */ case SS_SEND_VOTE: + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); /* On failure, logging & resetting is handled */ if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) return; @@ -1395,18 +1512,13 @@ AdvancePollState(int i, uint32 events) /* Start reading the walkeeper response for our candidate */ case SS_WAIT_VERDICT: - - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - if (!AsyncRead(i, &wk->voteResponse, sizeof(wk->voteResponse))) + wk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) return; elog(LOG, "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, wk->voteResponse.epoch, + wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); @@ -1426,18 +1538,16 @@ AdvancePollState(int i, uint32 events) Assert(wk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? 
*/ - - if (++n_votes != quorum) + n_votes++; + if (n_votes < quorum) + { + wk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) { - /* Can't start streaming earlier than truncateLsn */ - wk->startStreamingAt = truncateLsn; - Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); - /* - * We are already streaming WAL: send all pending messages - * to the attached walkeeper - */ - SendMessageToNode(i, msgQueueHead); + /* recovery already performed, just start streaming */ + SendProposerElected(wk); } else { @@ -1461,20 +1571,6 @@ AdvancePollState(int i, uint32 events) /* Perform recovery */ if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); - - /* - * This message signifies epoch switch; it is needed - * to make the switch happen on donor, as he won't get - * any other messages until we start writing new WAL - * (and we e.g. don't in --sync mode at all) - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - - if (syncSafekeepers) - { - /* keep polling until all walkeepers are synced */ - return; - } } else if (syncSafekeepers) { @@ -1482,12 +1578,50 @@ AdvancePollState(int i, uint32 events) fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); exit(0); } + + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + SendProposerElected(&walkeeper[i]); + } + + if (syncSafekeepers) + { + /* + * Queue empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. 
+ */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + + /* keep polling until all walkeepers are synced */ + return; + } + WalProposerStartStreaming(propEpochStartLsn); /* Should not return here */ } break; + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: + + /* + * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll + * wait until the next poll comes along. + */ + if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) + return; + + StartStreaming(wk); + + break; + + /* * Idle state for sending WAL. Moved out only by calls to * SendMessageToNode @@ -1495,7 +1629,7 @@ AdvancePollState(int i, uint32 events) case SS_IDLE: elog(WARNING, "EOF from node %s:%s in %s state", wk->host, wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(i); + ResetConnection(wk); break; /* @@ -1543,15 +1677,16 @@ AdvancePollState(int i, uint32 events) * message is stored after the end of the WalMessage * struct, in the allocation for each msg */ - if (!AsyncWrite(i, req, + if (!AsyncWrite(wk, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_SEND_WAL_FLUSH, SS_RECV_FEEDBACK)) + SS_SEND_WAL_FLUSH)) { if (req != &msg->req) free(req); return; } + wk->state = SS_RECV_FEEDBACK; if (req != &msg->req) free(req); @@ -1566,9 +1701,11 @@ AdvancePollState(int i, uint32 events) * the flush completes. If we still have more to do, we'll * wait until the next poll comes along. */ - if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0, SS_RECV_FEEDBACK)) + if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) return; + wk->state = SS_RECV_FEEDBACK; + break; /* @@ -1585,7 +1722,7 @@ AdvancePollState(int i, uint32 events) * necessary error handling or state setting is taken care * of. We can leave any other work until later. 
*/ - if (!AsyncRead(i, &wk->feedback, sizeof(wk->feedback))) + if (!AsyncReadFixed(i, &wk->feedback, sizeof(wk->feedback))) return; next = wk->currMsg->next; @@ -1622,44 +1759,52 @@ AdvancePollState(int i, uint32 events) } } -/* - * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, - * returning whether the read was successful. - * - * If the read needs more polling, we return 'false' and keep the state - * unmodified, waiting until it becomes read-ready to try again. If it fully - * failed, a warning is emitted and the connection is reset. +/* + * Try to read CopyData message from i'th safekeeper, resetting connection on + * failure. */ static bool -AsyncRead(int i, void *value, size_t value_size) +AsyncRead(int i, char **buf, int *buf_size) { WalKeeper *wk = &walkeeper[i]; - char *buf = NULL; - int buf_size = -1; - uint32 events; - switch (walprop_async_read(wk->conn, &buf, &buf_size)) + switch (walprop_async_read(wk->conn, buf, buf_size)) { - /* On success, there's just a couple more things we'll check below */ case PG_ASYNC_READ_SUCCESS: - break; + return true; - /* - * If we need more input, wait until the socket is read-ready and - * try again. - */ case PG_ASYNC_READ_TRY_AGAIN: - UpdateEventSet(wk, WL_SOCKET_READABLE); + /* WL_SOCKET_READABLE is always set during copyboth */ return false; case PG_ASYNC_READ_FAIL: - elog(WARNING, "Failed to read from node %s:%s in %s state: %s", - wk->host, wk->port, - FormatWalKeeperState(wk->state), + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", wk->host, + wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return false; } + Assert(false); + return false; +} + +/* + * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, + * returning whether the read was successful. 
+ * + * If the read needs more polling, we return 'false' and keep the state + * unmodified, waiting until it becomes read-ready to try again. If it fully + * failed, a warning is emitted and the connection is reset. + */ +static bool +AsyncReadFixed(int i, void *value, size_t value_size) +{ + WalKeeper *wk = &walkeeper[i]; + char *buf = NULL; + int buf_size = -1; + + if (!(AsyncRead(i, &buf, &buf_size))) + return false; /* * If we get here, the read was ok, but we still need to check it was the @@ -1677,14 +1822,68 @@ AsyncRead(int i, void *value, size_t value_size) /* Copy the resulting info into place */ memcpy(value, buf, buf_size); - /* Update the events for the WalKeeper, if it's going to wait */ - events = WalKeeperStateDesiredEvents(wk->state); - if (events) - UpdateEventSet(wk, events); - return true; } +/* + * Read next message with known type into provided struct. + * TODO: migrate AsyncReadFixed here for all messages + */ +static bool +AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) +{ + WalKeeper *wk = &walkeeper[i]; + char *buf; + int buf_size; + uint64 tag; + StringInfoData s; + + if (!(AsyncRead(i, &buf, &buf_size))) + return false; + + /* parse it */ + s.data = buf; + s.len = buf_size; + s.cursor = 0; + + tag = pq_getmsgint64_le(&s); + if (tag != anymsg->tag) + { + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + return false; + } + + switch (tag) + { + case 'v': + { + VoteResponse *msg = (VoteResponse *) anymsg; + + msg->term = pq_getmsgint64_le(&s); + msg->voteGiven = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->truncateLsn = pq_getmsgint64_le(&s); + msg->termHistory.n_entries = pq_getmsgint32_le(&s); + msg->termHistory.entries = palloc(sizeof(TermSwitchEntry) * msg->termHistory.n_entries); + for (int i = 0; i < msg->termHistory.n_entries; i++) + { + msg->termHistory.entries[i].term = 
pq_getmsgint64_le(&s); + msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); + } + pq_getmsgend(&s); + return true; + } + + default: + { + Assert(false); + return false; + } + } +} + /* * Blocking equivalent to AsyncWrite. * @@ -1702,7 +1901,7 @@ BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) elog(WARNING, "Failed to send to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); return false; } @@ -1721,23 +1920,18 @@ BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) /* * Starts a write into the 'i'th WAL keeper's postgres connection, moving to - * success_state only when the write succeeds. If the write needs flushing, - * moves to flush_state. + * flush_state (adjusting eventset) if write still needs flushing. * - * Returns false only if the write immediately fails. Upon failure, a warning is - * emitted and the connection is reset. + * Returns false if sending is unfinished (requires flushing or conn failed). + * Upon failure, a warning is emitted and the connection is reset. 
*/ static bool -AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKeeperState success_state) +AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state) { - WalKeeper *wk = &walkeeper[i]; - uint32 events; - switch (walprop_async_write(wk->conn, msg, msg_size)) { case PG_ASYNC_WRITE_SUCCESS: - wk->state = success_state; - break; + return true; case PG_ASYNC_WRITE_TRY_FLUSH: /* @@ -1746,37 +1940,30 @@ AsyncWrite(int i, void *msg, size_t msg_size, WalKeeperState flush_state, WalKee * this function */ wk->state = flush_state; - break; + UpdateEventSet(wk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + return false; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ShutdownConnection(i); + ShutdownConnection(wk); + return false; + default: + Assert(false); return false; } - - /* If the new state will be waiting for something, update the event set */ - events = WalKeeperStateDesiredEvents(wk->state); - if (events) - UpdateEventSet(wk, events); - - return true; } /* * Flushes a previous call to AsyncWrite. This only needs to be called when the * socket becomes read or write ready *after* calling AsyncWrite. * - * If flushing completes, moves to 'success_state' and returns true. If more - * flushes are needed, does nothing and returns true. - * - * On failure, emits a warning, resets the connection, and returns false. + * If flushing successfully completes returns true, otherwise false. 
*/ static bool -AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) +AsyncFlush(int i, bool socket_read_ready) { WalKeeper *wk = &walkeeper[i]; - uint32 events; /*--- * PQflush returns: @@ -1787,27 +1974,21 @@ AsyncFlush(int i, bool socket_read_ready, WalKeeperState success_state) switch (walprop_flush(wk->conn, socket_read_ready)) { case 0: - /* On success, move to the next state - that logic is further down */ - break; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* flush is done, unset write interest */ + return true; case 1: /* Nothing to do; try again when the socket's ready */ - return true; + return false; case -1: elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", wk->host, wk->port, FormatWalKeeperState(wk->state), walprop_error_message(wk->conn)); - ResetConnection(i); + ResetConnection(wk); + return false; + default: + Assert(false); return false; } - - wk->state = success_state; - - /* If the new state will be waiting for something, update the event set */ - events = WalKeeperStateDesiredEvents(wk->state); - if (events) - UpdateEventSet(wk, events); - - return true; } /* diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 16d84ac7f17..924b8fb1eb7 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -1,6 +1,7 @@ #include "postgres.h" #include "replication/walproposer.h" +#include "libpq/pqformat.h" #include "common/logging.h" #include "common/ip.h" #include "../interfaces/libpq/libpq-fe.h" @@ -68,6 +69,9 @@ FormatWalKeeperState(WalKeeperState state) case SS_WAIT_VERDICT: return_val = "wait-for-verdict"; break; + case SS_SEND_ELECTED_FLUSH: + return_val = "send-announcement-flush"; + break; case SS_IDLE: return_val = "idle"; break; @@ -151,6 +155,7 @@ WalKeeperStateDesiredEvents(WalKeeperState state) result = WL_NO_EVENTS; break; /* but flushing does require read- or write-ready */ + case 
SS_SEND_ELECTED_FLUSH: case SS_SEND_WAL_FLUSH: result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; break; @@ -266,3 +271,49 @@ HexDecodeString(uint8 *result, char *input, int nbytes) return true; } + +/* -------------------------------- + * pq_getmsgint32_le - get a binary 4-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint32 +pq_getmsgint32_le(StringInfo msg) +{ + uint32 n32; + + pq_copymsgbytes(msg, (char *) &n32, sizeof(n32)); + + return n32; +} + +/* -------------------------------- + * pq_getmsgint64 - get a binary 8-byte int from a message buffer in native (LE) order + * -------------------------------- + */ +uint64 +pq_getmsgint64_le(StringInfo msg) +{ + uint64 n64; + + pq_copymsgbytes(msg, (char *) &n64, sizeof(n64)); + + return n64; +} + +/* append a binary [u]int32 to a StringInfo buffer in native (LE) order */ +void +pq_sendint32_le(StringInfo buf, uint32 i) +{ + enlargeStringInfo(buf, sizeof(uint32)); + memcpy(buf->data + buf->len, &i, sizeof(uint32)); + buf->len += sizeof(uint32); +} + +/* append a binary [u]int64 to a StringInfo buffer in native (LE) order */ +void +pq_sendint64_le(StringInfo buf, uint64 i) +{ + enlargeStringInfo(buf, sizeof(uint64)); + memcpy(buf->data + buf->len, &i, sizeof(uint64)); + buf->len += sizeof(uint64); +} \ No newline at end of file diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 2e32e0f0f7c..30d8d72256c 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -148,6 +148,9 @@ typedef enum */ SS_WAIT_VERDICT, + /* need to flush ProposerAnnouncement */ + SS_SEND_ELECTED_FLUSH, + /* * Waiting for quorum to send WAL. Idle state. If the socket becomes * read-ready, the connection has been closed. @@ -180,7 +183,7 @@ typedef enum typedef uint64 term_t; /* - * Proposer -> Acceptor messaging. + * Proposer <-> Acceptor messaging. 
*/ /* Initial Proposer -> Acceptor message */ @@ -197,6 +200,11 @@ typedef struct ProposerGreeting uint32 walSegSize; } ProposerGreeting; +typedef struct AcceptorProposerMessage +{ + uint64 tag; +} AcceptorProposerMessage; + /* * Acceptor -> Proposer initial response: the highest term acceptor voted for. */ @@ -216,17 +224,47 @@ typedef struct VoteRequest pg_uuid_t proposerId; /* for monitoring/debugging */ } VoteRequest; +/* Element of term switching chain. */ +typedef struct TermSwitchEntry +{ + term_t term; + XLogRecPtr lsn; +} TermSwitchEntry; + +typedef struct TermHistory +{ + uint32 n_entries; + TermSwitchEntry *entries; +} TermHistory; + /* Vote itself, sent from safekeeper to proposer */ typedef struct VoteResponse { - uint64 tag; - term_t term; /* not really needed, just adds observability */ + AcceptorProposerMessage apm; + term_t term; uint64 voteGiven; - /// Safekeeper's log position, to let proposer choose the most advanced one - term_t epoch; + /* + * Safekeeper flush_lsn (end of WAL) + history of term switches allow + * proposer to choose the most advanced one. + */ XLogRecPtr flushLsn; XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ + TermHistory termHistory; } VoteResponse; +/* + * Proposer -> Acceptor message announcing proposer is elected and communicating + * epoch history to it. + */ +typedef struct ProposerElected +{ + uint64 tag; + term_t term; + /* proposer will send since this point */ + XLogRecPtr startStreamingAt; + /* history of term switches up to this proposer */ + TermHistory *termHistory; +} ProposerElected; + /* * Header of request with WAL message sent from proposer to walkeeper. 
*/ @@ -289,7 +327,6 @@ typedef struct AppendResponse */ uint64 tag; term_t term; - term_t epoch; XLogRecPtr flushLsn; // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit @@ -316,6 +353,7 @@ typedef struct WalKeeper * reach SS_SEND_WAL; not before. */ WalProposerConn* conn; + StringInfoData outbuf; WalMessage* currMsg; /* message been send to the receiver */ @@ -325,8 +363,7 @@ typedef struct WalKeeper VoteResponse voteResponse; /* the vote */ AppendResponse feedback; /* feedback to master */ /* - * streaming must be started at the record boundary which is saved here, if - * it differs from the chunk start + * Streaming will start here; must be record boundary. */ XLogRecPtr startStreamingAt; } WalKeeper; @@ -341,6 +378,10 @@ char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); bool HexDecodeString(uint8 *result, char *input, int nbytes); +uint32 pq_getmsgint32_le(StringInfo msg); +uint64 pq_getmsgint64_le(StringInfo msg); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); void WalProposerPoll(void); void WalProposerRegister(void); void ProcessStandbyReply(XLogRecPtr writePtr, From 7fb4e66a0d904d9eb5994b5680516612354a377b Mon Sep 17 00:00:00 2001 From: anastasia Date: Wed, 10 Nov 2021 00:09:13 +0300 Subject: [PATCH 079/167] Clarify the meaning of StandbyReply LSNs used for backpressure --- src/backend/access/transam/xloginsert.c | 8 +++++--- src/backend/replication/walproposer.c | 10 ++++++++-- src/backend/replication/walsender.c | 6 +++++- src/include/replication/walproposer.h | 3 ++- src/include/replication/walsender.h | 2 +- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index a8263c2496a..e04a010ce37 100644 --- a/src/backend/access/transam/xloginsert.c +++ 
b/src/backend/access/transam/xloginsert.c @@ -152,12 +152,14 @@ XLogBeginInsert(void) { XLogRecPtr replicaWriteLsn; XLogRecPtr replicaFlushLsn; + XLogRecPtr replicaApplyLsn; XLogRecPtr myFlushLsn = GetFlushRecPtr(); - GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn); + GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn, &replicaApplyLsn); - if ((replicaWriteLsn != UnknownXLogRecPtr - && myFlushLsn > replicaWriteLsn + max_replication_write_lag*MB) || + //TODO: rename max_replication_write_lag to max_replication_apply_lag ? + if ((replicaApplyLsn != UnknownXLogRecPtr + && myFlushLsn > replicaApplyLsn + max_replication_write_lag*MB) || (replicaFlushLsn != UnknownXLogRecPtr && myFlushLsn > replicaFlushLsn + max_replication_flush_lag*MB)) { diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 3defed7f9ab..720c5138f8d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -395,9 +395,15 @@ HandleWalKeeperResponse(void) /* advance the replication slot */ if (!syncSafekeepers) - ProcessStandbyReply(lastFeedback.diskConsistentLsn, + ProcessStandbyReply( + // write_lsn + // Not used, because we use SYNCHRONOUS_COMMIT_REMOTE_FLUSH. lastFeedback.flushLsn, - InvalidXLogRecPtr, GetCurrentTimestamp(), false); + //flush_lsn - This is what durably stored in WAL service. + lastFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. 
+ lastFeedback.diskConsistentLsn, + GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 028406c4880..b2250bc515c 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3755,10 +3755,11 @@ LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) * Get minimal write and flush LSN among all live replicas */ void -GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn) +GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply_lsn) { XLogRecPtr min_write_lsn = UnknownXLogRecPtr; XLogRecPtr min_flush_lsn = UnknownXLogRecPtr; + XLogRecPtr min_apply_lsn = UnknownXLogRecPtr; for (int i = 0; i < max_wal_senders; i++) { WalSnd *walsnd = &WalSndCtl->walsnds[i]; @@ -3771,11 +3772,14 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn) */ XLogRecPtr written = walsnd->write; XLogRecPtr flushed = walsnd->flush; + XLogRecPtr applied = walsnd->apply; min_write_lsn = Min(written, min_write_lsn); min_flush_lsn = Min(flushed, min_flush_lsn); + min_apply_lsn = Min(applied, min_apply_lsn); } } *write_lsn = min_write_lsn; *flush_lsn = min_flush_lsn; + *apply_lsn = min_apply_lsn; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 30d8d72256c..59f4d73ed6d 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -327,11 +327,12 @@ typedef struct AppendResponse */ uint64 tag; term_t term; + // TODO: add comment XLogRecPtr flushLsn; // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit XLogRecPtr commitLsn; - // Part of WALL applied and written to the disk by all pageservers + // Part of WAL applied and written to the disk by all pageservers XLogRecPtr diskConsistentLsn; HotStandbyFeedback hs; } AppendResponse; diff --git 
a/src/include/replication/walsender.h b/src/include/replication/walsender.h index bd2f9ad6d28..2ea0cbd69bc 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -48,7 +48,7 @@ extern void WalSndInitStopping(void); extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); -extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush); +extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush, XLogRecPtr* apply); /* * Remember that we want to wakeup walsenders later * From 3e0c8a3bdf23e19a58dd6fc6145c9f13ae2b3b50 Mon Sep 17 00:00:00 2001 From: anastasia Date: Thu, 11 Nov 2021 16:20:07 +0300 Subject: [PATCH 080/167] Use max_replication_apply_lag instead of max_replication_write_lag. Move backpressure throttling from XlogInsert, to ProcessInterrupts(), to restrict writing operations outside of critical section. --- src/backend/access/transam/xloginsert.c | 39 +------------------ src/backend/storage/buffer/bufmgr.c | 51 +++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 8 ++-- src/include/access/xloginsert.h | 2 +- 4 files changed, 57 insertions(+), 43 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index e04a010ce37..707889dd5d6 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -64,7 +64,7 @@ typedef struct } registered_buffer; /* GUCs */ -int max_replication_write_lag; +int max_replication_apply_lag; int max_replication_flush_lag; static registered_buffer *registered_buffers; @@ -122,7 +122,6 @@ static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen); /* Timeout in milliseconds for delaying WAL inserts to avoid WAL overflow */ -#define BACK_PRESSURE_TIMEOUT 100 #define MB ((XLogRecPtr)1024*1024) /* @@ -143,42 +142,6 @@ XLogBeginInsert(void) if (begininsert_called) elog(ERROR, 
"XLogBeginInsert was already called"); - if (max_replication_write_lag != 0 || max_replication_flush_lag != 0) - { - uint64 slept = 0; - - /* Suspend writes until replicas catch up */ - while (true) - { - XLogRecPtr replicaWriteLsn; - XLogRecPtr replicaFlushLsn; - XLogRecPtr replicaApplyLsn; - XLogRecPtr myFlushLsn = GetFlushRecPtr(); - - GetMinReplicaLsn(&replicaWriteLsn, &replicaFlushLsn, &replicaApplyLsn); - - //TODO: rename max_replication_write_lag to max_replication_apply_lag ? - if ((replicaApplyLsn != UnknownXLogRecPtr - && myFlushLsn > replicaApplyLsn + max_replication_write_lag*MB) || - (replicaFlushLsn != UnknownXLogRecPtr - && myFlushLsn > replicaFlushLsn + max_replication_flush_lag*MB)) - { - (void) WaitLatch(MyLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - BACK_PRESSURE_TIMEOUT, - WAIT_EVENT_BACK_PRESSURE); - ResetLatch(MyLatch); - slept += BACK_PRESSURE_TIMEOUT; - } - else - break; - } - - // XXX: INFO will cause a lot of regression tests to fail. - if (slept > 0) - elog(DEBUG1, "slept for " UINT64_FORMAT " ms while waiting for all replicas to catch up", slept); - } - begininsert_called = true; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index c3b054d1f29..190414d8718 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -54,6 +54,7 @@ #include "utils/rel.h" #include "utils/resowner_private.h" #include "utils/timestamp.h" +#include "replication/walsender.h" /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) @@ -4056,6 +4057,41 @@ UnlockBuffers(void) } } +// Check if we need to suspend inserts because of lagging replication. 
+static uint64 +backpressureThrottle() +{ + if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + GetMinReplicaLsn(&writePtr, &flushPtr, &applyPtr); + #define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((flushPtr != UnknownXLogRecPtr + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != UnknownXLogRecPtr + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} + /* * Acquire or release the content_lock for the buffer. */ @@ -4075,7 +4111,22 @@ LockBuffer(Buffer buffer, int mode) else if (mode == BUFFER_LOCK_SHARE) LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); else if (mode == BUFFER_LOCK_EXCLUSIVE) + { + // Suspend writes until replicas catch up + uint64 lag = backpressureThrottle(); + while (lag > 0) + { + elog(DEBUG2, "BackpressureThrottle LockBuffer(LW_EXCLUSIVE): lag %lu", lag); + #define BACK_PRESSURE_TIMEOUT 10000L // 0.01 sec + pg_usleep(BACK_PRESSURE_TIMEOUT); + lag = backpressureThrottle(); + + // We can hang here for a while. Don't block cancel requests. 
+ CHECK_FOR_INTERRUPTS(); + } + LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); + } else elog(ERROR, "unrecognized buffer lock mode: %d", mode); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a12f0b88bbb..7c133e0b090 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2900,13 +2900,13 @@ static struct config_int ConfigureNamesInt[] = }, { - {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + {"max_replication_apply_lag", PGC_POSTMASTER, REPLICATION_SENDING, gettext_noop("Maximal write lag between master and replicas."), - gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," - "backends are blocked"), + gettext_noop("When lag between minimal apply position of replica and current LSN exceeds this value," + "backends are blocked."), GUC_UNIT_MB, }, - &max_replication_write_lag, + &max_replication_apply_lag, 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 699ca56ed25..45dcaf99d9e 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,7 +38,7 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ -extern int max_replication_write_lag; +extern int max_replication_apply_lag; extern int max_replication_flush_lag; /* prototypes for public functions in xloginsert.c: */ From 2c00fedf1449ac127353c97e2b476ef113c47507 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 25 Nov 2021 18:58:39 +0300 Subject: [PATCH 081/167] Forward pageserver connection string to safekeeper This is needed for implementation of tenant rebalancing. With this change safekeeper becomes aware of which pageserver is supposed to be used for replication from this compute. This also changes logic of substitution of auth token inside the connection string. 
So it is substituted during config variable parsing and available for both, smgr pageserver connection and walproposer safekeeper connection. --- contrib/zenith/libpagestore.c | 187 ++++++++++++++------------ contrib/zenith/pagestore_smgr.c | 2 +- src/backend/replication/walproposer.c | 36 +++-- src/include/replication/walproposer.h | 1 + 4 files changed, 128 insertions(+), 98 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 1dc708f0ad7..2caf5d74b6e 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -42,6 +42,8 @@ void _PG_init(void); bool connected = false; PGconn *pageserver_conn; +char *page_server_connstring_raw; + static ZenithResponse *zenith_call(ZenithRequest *request); page_server_api api = { .request = zenith_call @@ -52,93 +54,8 @@ zenith_connect() { char *query; int ret; - char *auth_token; - char *err = NULL; - PQconninfoOption *conn_options; - PQconninfoOption *conn_option; - int noptions = 0; - - /* this is heavily inspired by psql/command.c::do_connect */ - conn_options = PQconninfoParse(page_server_connstring, &err); - - if (conn_options == NULL) - { - /* The error string is malloc'd, so we must free it explicitly */ - char *errcopy = err ? pstrdup(err) : "out of memory"; - PQfreemem(err); - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid connection string syntax: %s", errcopy))); - } - - /* - * Trying to populate pageserver connection string with auth token from - * environment. We are looking for password in with placeholder value like - * $ENV_VAR_NAME, so if password field is present and starts with $ we try - * to fetch environment variable value and fail loudly if it is not set. 
- */ - for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) - { - noptions++; - if (strcmp(conn_option->keyword, "password") == 0) - { - if (conn_option->val != NULL && conn_option->val[0] != '\0') - { - /* ensure that this is a template */ - if (strncmp(conn_option->val, "$", 1) != 0) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); - - zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); - auth_token = getenv(&conn_option->val[1]); - if (!auth_token) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); - } - else - { - zenith_log(LOG, "using auth token from environment passed via env"); - - /* - * inspired by PQconninfoFree and conninfo_storeval so - * just free the old one and replace with freshly - * malloc'ed one - */ - free(conn_option->val); - conn_option->val = strdup(auth_token); - } - } - } - } - - /* - * copy values from PQconninfoOption to key/value arrays because - * PQconnectdbParams accepts options this way - */ - { - const char **keywords = malloc((noptions + 1) * sizeof(*keywords)); - const char **values = malloc((noptions + 1) * sizeof(*values)); - int i = 0; - - for (i = 0; i < noptions; i++) - { - keywords[i] = conn_options[i].keyword; - values[i] = conn_options[i].val; - } - /* add array terminator */ - keywords[i] = NULL; - values[i] = NULL; - - pageserver_conn = PQconnectdbParams(keywords, values, false); - free(keywords); - free(values); - } - - PQconninfoFree(conn_options); + pageserver_conn = PQconnectdb(page_server_connstring); if (PQstatus(pageserver_conn) == CONNECTION_BAD) { @@ -197,6 +114,7 @@ zenith_connect() } } + // FIXME: when auth is enabled this prints JWT to logs zenith_log(LOG, "libpqpagestore: connected to '%s'",
page_server_connstring); connected = true; @@ -276,6 +194,96 @@ check_zenith_id(char **newval, void **extra, GucSource source) return **newval == '\0' || HexDecodeString(zid, *newval, 16); } +static char * +substitute_pageserver_password(const char *page_server_connstring_raw) +{ + char *host = NULL; + char *port = NULL; + char *user = NULL; + char *auth_token = NULL; + char *err = NULL; + char *page_server_connstring = NULL; + PQconninfoOption *conn_options; + PQconninfoOption *conn_option; + MemoryContext oldcontext; + /* + * Here we substitute password in connection string with an environment variable. + * To simplify things we construct a connection string back with only known options. + * In particular: host port user and password. We do not currently use other options and + * constructing full connstring in an URI shape is quite messy. + */ + + if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0') + return NULL; + + /* extract the auth token from the connection string */ + conn_options = PQconninfoParse(page_server_connstring_raw, &err); + if (conn_options == NULL) + { + /* The error string is malloc'd, so we must free it explicitly */ + char *errcopy = err ? pstrdup(err) : "out of memory"; + + PQfreemem(err); + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid connection string syntax: %s", errcopy))); + } + + /* + * Trying to populate pageserver connection string with auth token from + * environment. We are looking for password in with placeholder value like + * $ENV_VAR_NAME, so if password field is present and starts with $ we try + * to fetch environment variable value and fail loudly if it is not set. 
+ */ + for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++) + { + if (strcmp(conn_option->keyword, "host") == 0) { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + host = conn_option->val; + } + else if (strcmp(conn_option->keyword, "port") == 0) { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + port = conn_option->val; + } + else if (strcmp(conn_option->keyword, "user") == 0) { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + user = conn_option->val; + } + else if (strcmp(conn_option->keyword, "password") == 0) + { + if (conn_option->val != NULL && conn_option->val[0] != '\0') + { + /* ensure that this is a template */ + if (strncmp(conn_option->val, "$", 1) != 0) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1]))); + + zenith_log(LOG, "found auth token placeholder in pageserver conn string %s", &conn_option->val[1]); + auth_token = getenv(&conn_option->val[1]); + if (!auth_token) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1]))); + } + else + { + zenith_log(LOG, "using auth token from environment passed via env"); + } + } + } + } + // allocate connection string in a TopMemoryContext to make sure it is not freed + oldcontext = CurrentMemoryContext; + MemoryContextSwitchTo(TopMemoryContext); + page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? 
auth_token : "", host, port); + MemoryContextSwitchTo(oldcontext); + + PQconninfoFree(conn_options); + return page_server_connstring; +} + /* * Module initialization function */ @@ -285,7 +293,7 @@ _PG_init(void) DefineCustomStringVariable("zenith.page_server_connstring", "connection string to the page server", NULL, - &page_server_connstring, + &page_server_connstring_raw, "", PGC_POSTMASTER, 0, /* no flags required */ @@ -335,9 +343,14 @@ _PG_init(void) zenith_log(PqPageStoreTrace, "libpqpagestore already loaded"); page_server = &api; + /* substitute password in pageserver_connstring */ + page_server_connstring = substitute_pageserver_password(page_server_connstring_raw); + /* Is there more correct way to pass CustomGUC to postgres code? */ zenith_timeline_walproposer = zenith_timeline; zenith_tenant_walproposer = zenith_tenant; + /* Walproposer instructs safekeeper which pageserver to use for replication */ + zenith_pageserver_connstring_walproposer = page_server_connstring; if (wal_redo) { diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 25ad896491b..81aa2339779 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -86,7 +86,7 @@ const int SmgrTrace = DEBUG5; page_server_api *page_server; /* GUCs */ -char *page_server_connstring; +char *page_server_connstring; // with substituted password char *callmemaybe_connstring; char *zenith_timeline; char *zenith_tenant; diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 720c5138f8d..e0fbd653f92 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -283,9 +283,14 @@ ResetConnection(WalKeeper *wk) */ if (wk->conninfo[0] == '\0') { - sprintf((char *) &wk->conninfo, + int written = 0; + written = snprintf((char *) &wk->conninfo, MAXCONNINFO, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", wk->host, wk->port, zenith_timeline_walproposer,
zenith_tenant_walproposer); + // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, + // so it is better to be defensive and check that everything aligns well + if (written > MAXCONNINFO || written < 0) + elog(FATAL, "could not create connection string for walkeeper %s:%s", wk->host, wk->port); } wk->conn = walprop_connect_start((char *) &wk->conninfo); @@ -495,6 +500,7 @@ HandleWalKeeperResponse(void) char *zenith_timeline_walproposer = NULL; char *zenith_tenant_walproposer = NULL; +char *zenith_pageserver_connstring_walproposer = NULL; static void @@ -566,6 +572,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) if (*zenith_tenant_walproposer != '\0' && !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + proposerGreeting.timeline = ThisTimeLineID; proposerGreeting.walSegSize = wal_segment_size; @@ -1337,18 +1344,27 @@ AdvancePollState(int i, uint32 events) * sending, wait for response with SS_WAIT_EXEC_RESULT */ case SS_EXEC_STARTWALPUSH: - if (!walprop_send_query(wk->conn, "START_WAL_PUSH")) { - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; + char *query = NULL; + if (zenith_pageserver_connstring_walproposer != NULL) { + query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); + } else { + query = psprintf("START_WAL_PUSH"); + } + if (!walprop_send_query(wk->conn, query)) + { + pfree(query); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; + } + pfree(query); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); + break; } - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, 
WL_SOCKET_READABLE); - break; - case SS_WAIT_EXEC_RESULT: switch (walprop_get_query_result(wk->conn)) { diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 59f4d73ed6d..2b6d281ec2a 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -36,6 +36,7 @@ typedef struct WalMessage WalMessage; extern char *zenith_timeline_walproposer; extern char *zenith_tenant_walproposer; +extern char *zenith_pageserver_connstring_walproposer; /* Possible return values from ReadPGAsync */ typedef enum From 9e50f219696798baead71fe06d82d4c10f08cda8 Mon Sep 17 00:00:00 2001 From: anastasia Date: Fri, 26 Nov 2021 12:10:10 +0300 Subject: [PATCH 082/167] Move backpressure throttling to ProcessInterrupts() --- src/backend/access/transam/xloginsert.c | 5 +++ src/backend/replication/walsender.c | 34 +++++++++++++++++ src/backend/storage/buffer/bufmgr.c | 50 ------------------------- src/backend/tcop/postgres.c | 34 ++++++++++++++++- src/include/replication/walsender.h | 1 + 5 files changed, 72 insertions(+), 52 deletions(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 707889dd5d6..81bea0fb19e 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -460,6 +460,11 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + if (backpressure_lag() > 0) + { + InterruptPending = true; + } + do { XLogRecPtr RedoRecPtr; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index b2250bc515c..2d54ef7fd8e 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3783,3 +3783,37 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply *apply_lsn = min_apply_lsn; } +// Check if we need to suspend inserts because of lagging replication. 
+uint64 +backpressure_lag(void) +{ + if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) + { + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + XLogRecPtr myFlushLsn = GetFlushRecPtr(); + + GetMinReplicaLsn(&writePtr, &flushPtr, &applyPtr); + #define MB ((XLogRecPtr)1024*1024) + + elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", + LSN_FORMAT_ARGS(myFlushLsn), + LSN_FORMAT_ARGS(writePtr), + LSN_FORMAT_ARGS(flushPtr), + LSN_FORMAT_ARGS(applyPtr)); + + if ((flushPtr != UnknownXLogRecPtr + && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) + { + return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); + } + + if ((applyPtr != UnknownXLogRecPtr + && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) + { + return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); + } + } + return 0; +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 190414d8718..a90f6432701 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4057,41 +4057,6 @@ UnlockBuffers(void) } } -// Check if we need to suspend inserts because of lagging replication. 
-static uint64 -backpressureThrottle() -{ - if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) - { - XLogRecPtr writePtr; - XLogRecPtr flushPtr; - XLogRecPtr applyPtr; - XLogRecPtr myFlushLsn = GetFlushRecPtr(); - - GetMinReplicaLsn(&writePtr, &flushPtr, &applyPtr); - #define MB ((XLogRecPtr)1024*1024) - - elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", - LSN_FORMAT_ARGS(myFlushLsn), - LSN_FORMAT_ARGS(writePtr), - LSN_FORMAT_ARGS(flushPtr), - LSN_FORMAT_ARGS(applyPtr)); - - if ((flushPtr != UnknownXLogRecPtr - && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) - { - return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); - } - - if ((applyPtr != UnknownXLogRecPtr - && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) - { - return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); - } - } - return 0; -} - /* * Acquire or release the content_lock for the buffer. */ @@ -4111,22 +4076,7 @@ LockBuffer(Buffer buffer, int mode) else if (mode == BUFFER_LOCK_SHARE) LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); else if (mode == BUFFER_LOCK_EXCLUSIVE) - { - // Suspend writes until replicas catch up - uint64 lag = backpressureThrottle(); - while (lag > 0) - { - elog(DEBUG2, "BackpressureThrottle LockBuffer(LW_EXCLUSIVE): lag %lu", lag); - #define BACK_PRESSURE_TIMEOUT 10000L // 0.01 sec - pg_usleep(BACK_PRESSURE_TIMEOUT); - lag = backpressureThrottle(); - - // We can hang here for a while. Don't block cancel requests. - CHECK_FOR_INTERRUPTS(); - } - LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); - } else elog(ERROR, "unrecognized buffer lock mode: %d", mode); } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 171f3a95006..35523475281 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3125,8 +3125,8 @@ RecoveryConflictInterrupt(ProcSignalReason reason) * return; another interrupt could have arrived. 
But we promise that * any pre-existing one will have been serviced.) */ -void -ProcessInterrupts(void) +static void +ProcessInterrupts_pg(void) { /* OK to accept any interrupts now? */ if (InterruptHoldoffCount != 0 || CritSectionCount != 0) @@ -3368,6 +3368,36 @@ ProcessInterrupts(void) ProcessLogMemoryContextInterrupt(); } +void +ProcessInterrupts(void) +{ + uint64 lag; + + if (InterruptHoldoffCount != 0 || CritSectionCount != 0) + return; + + // Don't throttle read only transactions + if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + { + ProcessInterrupts_pg(); + return; + } + + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec + while(true) + { + ProcessInterrupts_pg(); + + // Suspend writers until replicas catch up + lag = backpressure_lag(); + if (lag <= 0) + break; + + elog(DEBUG2, "backpressure throttling: lag %lu", lag); + pg_usleep(BACK_PRESSURE_DELAY); + } +} + /* * IA64-specific code to fetch the AR.BSP register for stack depth checks. diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 2ea0cbd69bc..fe21617994a 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -49,6 +49,7 @@ extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); extern void GetMinReplicaLsn(XLogRecPtr* write, XLogRecPtr* flush, XLogRecPtr* apply); +extern uint64 backpressure_lag(void); /* * Remember that we want to wakeup walsenders later * From 41e38c0f6e2a0e01277582b61de238eeacfa134d Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 15 Dec 2021 16:10:03 +0300 Subject: [PATCH 083/167] Stop building docker images in this repo. Now docker images are being built in zenith repo as that way we have sequential version number that allows us to compare compute/storage versions. 
--- .circleci/config.yml | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 16a271b0386..00000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,30 +0,0 @@ -version: 2.1 - -jobs: - - # Build zenithdb/compute-node:latest image and push it to Docker hub - docker_image: - docker: - - image: cimg/base:2021.04 - steps: - - checkout - - setup_remote_docker: - docker_layer_caching: true - - run: - name: Build and push Docker image - command: | - echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin - docker build -t zenithdb/compute-node:latest . && docker push zenithdb/compute-node:latest - -workflows: - version: 2 - compute_node: - jobs: - # Build and push image only for commits to `main`. - - docker_image: - # Context gives an ability to login - context: Docker Hub - filters: - branches: - only: - - main From 9bbc511ece0c699eb18fff1e831fa5f821f996b0 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 21 Dec 2021 16:51:22 +0300 Subject: [PATCH 084/167] [walproposer] Async WAL append (#105) Implement async wp <-> sk protocol, send WAL messages ahead of feedback replies. New SS_ACTIVE state is introduced instead of former SS_SEND_WAL / SS_SEND_WAL_FLUSH / SS_RECV_FEEDBACK. 
--- .../libpqwalproposer/libpqwalproposer.c | 9 +- src/backend/replication/walproposer.c | 357 +++++++++++------- src/backend/replication/walproposer_utils.c | 15 +- src/include/replication/walproposer.h | 38 +- 4 files changed, 230 insertions(+), 189 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index f538ed9133f..177c93eb85d 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -22,7 +22,7 @@ static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* static bool libpqprop_send_query(WalProposerConn* conn, char* query); static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn); static pgsocket libpqprop_socket(WalProposerConn* conn); -static int libpqprop_flush(WalProposerConn* conn, bool socket_read_ready); +static int libpqprop_flush(WalProposerConn* conn); static void libpqprop_finish(WalProposerConn* conn); static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount); static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size); @@ -239,13 +239,8 @@ libpqprop_socket(WalProposerConn* conn) } static int -libpqprop_flush(WalProposerConn* conn, bool socket_read_ready) +libpqprop_flush(WalProposerConn* conn) { - /* If the socket is read-ready, we have to call PQconsumeInput before - * calling PQflush (according to libpq docs) */ - if (socket_read_ready && !PQconsumeInput(conn->pg_conn)) - return -1; /* return failure if PQconsumeInput fails */ - return (PQflush(conn->pg_conn)); } diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index e0fbd653f92..99a77aba280 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -102,7 +102,7 @@ static bool AsyncReadFixed(int i, void *value, 
size_t value_size); static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); -static bool AsyncFlush(int i, bool socket_read_ready); +static bool AsyncFlush(WalKeeper *wk); static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); static void BroadcastMessage(WalMessage *msg); static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); @@ -110,6 +110,7 @@ static term_t GetHighestTerm(TermHistory *th); static term_t GetEpoch(WalKeeper *wk); static void SendProposerElected(WalKeeper *wk); static void StartStreaming(WalKeeper *wk); +static bool SendAppendRequests(WalKeeper *wk); /* @@ -236,8 +237,9 @@ HackyRemoveWalProposerEvent(WalKeeper *to_remove) continue; /* If this WAL keeper isn't offline, add an event for it! */ - if ((desired_events = WalKeeperStateDesiredEvents(wk->state))) + if (wk->conn != NULL) { + desired_events = WalKeeperStateDesiredEvents(wk->state); wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); } } @@ -251,7 +253,10 @@ ShutdownConnection(WalKeeper *wk) walprop_finish(wk->conn); wk->conn = NULL; wk->state = SS_OFFLINE; + wk->flushWrite = false; wk->currMsg = NULL; + wk->ackMsg = NULL; + if (wk->voteResponse.termHistory.entries) pfree(wk->voteResponse.termHistory.entries); wk->voteResponse.termHistory.entries = NULL; @@ -546,7 +551,9 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) */ walkeeper[n_walkeepers].conninfo[0] = '\0'; initStringInfo(&walkeeper[n_walkeepers].outbuf); + walkeeper[n_walkeepers].flushWrite = false; walkeeper[n_walkeepers].currMsg = NULL; + walkeeper[n_walkeepers].ackMsg = NULL; walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; n_walkeepers += 1; } @@ -699,7 +706,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) } /* - * Send message to the particular 
node + * Start sending message to the particular node. * * Always updates the state and event set for the WAL keeper; setting either of * these before calling would be redundant work. @@ -720,27 +727,11 @@ SendMessageToNode(int i, WalMessage *msg) msg = msg->next; wk->currMsg = msg; + wk->flushWrite = false; - /* Only try to send the message if it's non-null */ - if (wk->currMsg) - { - wk->currMsg->req.commitLsn = GetAcknowledgedByQuorumWALPosition(); - wk->currMsg->req.truncateLsn = truncateLsn; - - /* - * Once we've selected and set up our message, actually start sending - * it. - */ - wk->state = SS_SEND_WAL; - /* Don't ned to update the event set; that's done by AdvancePollState */ - - AdvancePollState(i, WL_NO_EVENTS); - } - else - { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); - } + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + if (!SendAppendRequests(wk)) + return; } /* @@ -751,7 +742,7 @@ BroadcastMessage(WalMessage *msg) { for (int i = 0; i < n_walkeepers; i++) { - if (walkeeper[i].state == SS_IDLE && walkeeper[i].currMsg == NULL) + if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) { SendMessageToNode(i, msg); } @@ -1154,13 +1145,20 @@ SendProposerElected(WalKeeper *wk) } /* - * Start streaming to safekeeper wk. + * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. */ static void StartStreaming(WalKeeper *wk) { int wki = wk - walkeeper; + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. 
+ */ + wk->state = SS_ACTIVE; + UpdateEventSet(wk, WL_SOCKET_READABLE); + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { if (msg->req.endLsn <= wk->startStreamingAt) @@ -1174,7 +1172,6 @@ StartStreaming(WalKeeper *wk) return; } } - wk->state = SS_IDLE; /* nothing to send yet, safekeeper is recovered */ } /* @@ -1233,13 +1230,184 @@ WalProposerPoll(void) } } +/* + * Send queue messages starting from wk->currMsg until the end or non-writable + * socket, whichever comes first. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(WalKeeper *wk) +{ + int wki = wk - walkeeper; + WalMessage *msg; + AppendRequestHeader *req; + + if (wk->flushWrite) + { + if (!AsyncFlush(wk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. + */ + return wk->state == SS_ACTIVE; + + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } + + while (wk->currMsg) + { + msg = wk->currMsg; + req = &msg->req; + + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + + Assert((msg->ackMask & (1 << wki)) == 0); + + /* + * If we need to send this message not from the beginning, + * form the cut version. Only happens for the first + * message. 
+ */ + if (wk->startStreamingAt > msg->req.beginLsn) + { + uint32 len; + uint32 size; + + Assert(wk->startStreamingAt < req->endLsn); + + len = msg->req.endLsn - wk->startStreamingAt; + size = sizeof(AppendRequestHeader) + len; + req = malloc(size); + *req = msg->req; + req->beginLsn = wk->startStreamingAt; + memcpy(req + 1, + (char *) (&msg->req + 1) + wk->startStreamingAt - + msg->req.beginLsn, + len); + } + + elog(LOG, + "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + + /* if this is the first sent message, we should start processing feedback */ + if (wk->ackMsg == NULL) + wk->ackMsg = wk->currMsg; + + /* + * We write with msg->size here because the body of the + * message is stored after the end of the WalMessage + * struct, in the allocation for each msg + */ + if (!AsyncWrite(wk, req, + sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, + SS_ACTIVE)) + { + if (req != &msg->req) + free(req); + if (wk->state == SS_ACTIVE) + { + wk->flushWrite = true; + return true; + } + return false; + } + if (req != &msg->req) + free(req); + + /* continue writing the next message */ + wk->currMsg = wk->currMsg->next; + } + + return true; +} + +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(WalKeeper *wk) +{ + XLogRecPtr minQuorumLsn; + int wki = wk - walkeeper; + bool readAnything = false; + + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. 
+ */ + if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + break; + + Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); + + /* + * We shouldn't read responses ahead of wk->currMsg, because that will + * look like we are receiving responses for messages that haven't been + * sent yet. This can happen when message was placed in a buffer in + * SendAppendRequests, but sent through a wire only with a flush inside + * AsyncReadFixed. In this case, we should move wk->currMsg. + */ + if (wk->ackMsg == wk->currMsg) + { + /* Couldn't happen without flush flag */ + Assert(wk->flushWrite); + + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } + + wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms + * receiving of this + * message */ + + wk->ackMsg = wk->ackMsg->next; + readAnything = true; + } + + if (!readAnything) + return wk->state == SS_ACTIVE; + + HandleWalKeeperResponse(); + + /* + * Also send the new commit lsn to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other + * outbound messages pending. + */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + lastSentCommitLsn = minQuorumLsn; + } + + return wk->state == SS_ACTIVE; +} + /* Performs the logic for advancing the state machine of the 'i'th walkeeper, * given that a certain set of events has occured. 
*/ static void AdvancePollState(int i, uint32 events) { WalKeeper *wk = &walkeeper[i]; - /* * Keep advancing the state while either: (a) the event is still * unprocessed (usually because it's the first iteration of the loop), or @@ -1405,7 +1573,7 @@ AdvancePollState(int i, uint32 events) * generic "something went wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad resonse from walkeeper %s:%s query execution", + elog(WARNING, "Received bad response from walkeeper %s:%s query execution", wk->host, wk->port); ShutdownConnection(wk); return; @@ -1607,6 +1775,12 @@ AdvancePollState(int i, uint32 events) SendProposerElected(&walkeeper[i]); } + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. + */ + if (syncSafekeepers) { /* @@ -1636,7 +1810,7 @@ AdvancePollState(int i, uint32 events) * the flush completes. If we still have more to do, we'll * wait until the next poll comes along. */ - if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) + if (!AsyncFlush(wk)) return; StartStreaming(wk); @@ -1654,123 +1828,18 @@ AdvancePollState(int i, uint32 events) ResetConnection(wk); break; - /* - * Start to send the message at wk->currMsg. Triggered only by - * calls to SendMessageToNode - */ - case SS_SEND_WAL: - { - WalMessage *msg = wk->currMsg; - AppendRequestHeader *req = &msg->req; - /* - * If we need to send this message not from the beginning, - * form the cut version. Only happens for the first - * message. 
- */ - if (wk->startStreamingAt > msg->req.beginLsn) - { - uint32 len; - uint32 size; - - Assert(wk->startStreamingAt < req->endLsn); - - len = msg->req.endLsn - wk->startStreamingAt; - size = sizeof(AppendRequestHeader) + len; - req = malloc(size); - *req = msg->req; - req->beginLsn = wk->startStreamingAt; - memcpy(req + 1, - (char *) (&msg->req + 1) + wk->startStreamingAt - - msg->req.beginLsn, - len); - } - - elog(LOG, - "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); - - /* - * We write with msg->size here because the body of the - * message is stored after the end of the WalMessage - * struct, in the allocation for each msg - */ - if (!AsyncWrite(wk, req, - sizeof(AppendRequestHeader) + req->endLsn - - req->beginLsn, - SS_SEND_WAL_FLUSH)) - { - if (req != &msg->req) - free(req); + case SS_ACTIVE: + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(wk)) return; - } - wk->state = SS_RECV_FEEDBACK; - if (req != &msg->req) - free(req); - - break; - } - - /* Flush the WAL message we're sending from SS_SEND_WAL */ - case SS_SEND_WAL_FLUSH: - /* - * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll - * wait until the next poll comes along. - */ - if (!AsyncFlush(i, (events & WL_SOCKET_READABLE) != 0)) - return; - - wk->state = SS_RECV_FEEDBACK; - - break; - - /* - * Start to receive the feedback from a message sent via - * SS_SEND_WAL - */ - case SS_RECV_FEEDBACK: - { - WalMessage *next; - XLogRecPtr minQuorumLsn; - - /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. 
- */ - if (!AsyncReadFixed(i, &wk->feedback, sizeof(wk->feedback))) + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(wk)) return; - next = wk->currMsg->next; - wk->currMsg->ackMask |= 1 << i; /* this walkeeper confirms - * receiving of this - * message */ - - wk->currMsg = NULL; - HandleWalKeeperResponse(); - SendMessageToNode(i, next); /* Updates state & event set */ - - /* - * Also send the new commit lsn to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other - * outbound messages pending. - */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - - if (minQuorumLsn > lastSentCommitLsn) - { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - lastSentCommitLsn = minQuorumLsn; - } - break; - } + UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); + break; } /* @@ -1983,17 +2052,15 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * If flushing successfully completes returns true, otherwise false. 
*/ static bool -AsyncFlush(int i, bool socket_read_ready) +AsyncFlush(WalKeeper *wk) { - WalKeeper *wk = &walkeeper[i]; - /*--- * PQflush returns: * 0 if successful [we're good to move on] * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] */ - switch (walprop_flush(wk->conn, socket_read_ready)) + switch (walprop_flush(wk->conn)) { case 0: UpdateEventSet(wk, WL_SOCKET_READABLE); /* flush is done, unset write interest */ diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 924b8fb1eb7..c61ab87db45 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -75,14 +75,8 @@ FormatWalKeeperState(WalKeeperState state) case SS_IDLE: return_val = "idle"; break; - case SS_SEND_WAL: - return_val = "WAL-sending"; - break; - case SS_SEND_WAL_FLUSH: - return_val = "WAL-sending (flushing)"; - break; - case SS_RECV_FEEDBACK: - return_val = "WAL-feedback-receiving"; + case SS_ACTIVE: + return_val = "active"; break; } @@ -143,7 +137,6 @@ WalKeeperStateDesiredEvents(WalKeeperState state) case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: - case SS_RECV_FEEDBACK: result = WL_SOCKET_READABLE; break; @@ -151,12 +144,12 @@ WalKeeperStateDesiredEvents(WalKeeperState state) case SS_EXEC_STARTWALPUSH: case SS_HANDSHAKE_SEND: case SS_SEND_VOTE: - case SS_SEND_WAL: result = WL_NO_EVENTS; break; /* but flushing does require read- or write-ready */ case SS_SEND_ELECTED_FLUSH: - case SS_SEND_WAL_FLUSH: + /* Active state does both reading and writing to the socket */ + case SS_ACTIVE: result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; break; diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 2b6d281ec2a..ca27df2d19b 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -70,10 +70,7 @@ typedef enum /* * WAL safekeeper state * - * States are 
listed here in the order that they're executed - with the only - * exception occuring from the "send WAL" cycle, which loops as: - * - * SS_IDLE -> SS_SEND_WAL (+ flush) -> SS_RECV_FEEDBACK -> SS_IDLE/SS_SEND_WAL + * States are listed here in the order that they're executed. * * Most states, upon failure, will move back to SS_OFFLINE by calls to * ResetConnection or ShutdownConnection. @@ -156,28 +153,15 @@ typedef enum * Waiting for quorum to send WAL. Idle state. If the socket becomes * read-ready, the connection has been closed. * - * Moves to SS_SEND_WAL only by calls to SendMessageToNode. + * Moves to SS_ACTIVE only by calls to SendMessageToNode. */ SS_IDLE, + /* - * Start sending the message at currMsg. This state is only ever reached - * through calls to SendMessageToNode. - * - * Sending needs to flush; immediately moves to SS_SEND_WAL_FLUSH. - */ - SS_SEND_WAL, - /* - * Flush the WAL message, repeated until successful. On success, moves to - * SS_RECV_FEEDBACK. - */ - SS_SEND_WAL_FLUSH, - /* - * Currently reading feedback from sending the WAL. - * - * After reading, moves to (SS_SEND_WAL or SS_IDLE) by calls to - * SendMessageToNode. + * Active phase, when we acquired quorum and have WAL to send or feedback + * to read. */ - SS_RECV_FEEDBACK, + SS_ACTIVE, } WalKeeperState; /* Consensus logical timestamp. */ @@ -352,12 +336,14 @@ typedef struct WalKeeper * postgres protocol connection to the WAL acceptor * * Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we - * reach SS_SEND_WAL; not before. + * reach SS_ACTIVE; not before. */ WalProposerConn* conn; StringInfoData outbuf; + bool flushWrite; /* set to true if we wrote currMsg, but still need to call AsyncFlush */ WalMessage* currMsg; /* message been send to the receiver */ + WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. 
Equal to -1 if no event */ WalKeeperState state; /* walkeeper state machine state */ @@ -470,7 +456,7 @@ typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerCon typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn); /* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */ -typedef int (*walprop_flush_fn) (WalProposerConn* conn, bool socket_read_ready); +typedef int (*walprop_flush_fn) (WalProposerConn* conn); /* Re-exported PQfinish */ typedef void (*walprop_finish_fn) (WalProposerConn* conn); @@ -545,8 +531,8 @@ typedef struct WalProposerFunctionsType WalProposerFunctions->walprop_set_nonblocking(conn, arg) #define walprop_socket(conn) \ WalProposerFunctions->walprop_socket(conn) -#define walprop_flush(conn, consume_input) \ - WalProposerFunctions->walprop_flush(conn, consume_input) +#define walprop_flush(conn) \ + WalProposerFunctions->walprop_flush(conn) #define walprop_finish(conn) \ WalProposerFunctions->walprop_finish(conn) #define walprop_async_read(conn, buf, amount) \ From 9ba395f174d1c9ada0f40a4800d3a09539ea6e5f Mon Sep 17 00:00:00 2001 From: anastasia Date: Wed, 22 Dec 2021 18:47:38 +0300 Subject: [PATCH 085/167] Fix walsender to work with zenith style standbyReply that sends non-zero flushLsn. Clean up backpressure defaults. --- src/backend/replication/walproposer.c | 3 +-- src/backend/replication/walsender.c | 6 ++++-- src/backend/utils/misc/guc.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 99a77aba280..95b9c0ae32d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -406,8 +406,7 @@ HandleWalKeeperResponse(void) /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( - // write_lsn - // Not used, because we use SYNCHRONOUS_COMMIT_REMOTE_FLUSH. + // write_lsn - This is what durably stored in WAL service. 
lastFeedback.flushLsn, //flush_lsn - This is what durably stored in WAL service. lastFeedback.flushLsn, diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 2d54ef7fd8e..9f3b0bf64c0 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2991,8 +2991,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? - MyWalSnd->write : MyWalSnd->flush; + // XXX Zenith uses flush_lsn to pass extra payload, so use write_lsn here + replicatedPtr = MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) @@ -3804,12 +3804,14 @@ backpressure_lag(void) LSN_FORMAT_ARGS(applyPtr)); if ((flushPtr != UnknownXLogRecPtr + && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) { return (myFlushLsn - flushPtr - max_replication_flush_lag*MB); } if ((applyPtr != UnknownXLogRecPtr + && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag*MB)) { return (myFlushLsn - applyPtr - max_replication_apply_lag*MB); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 7c133e0b090..5d3b1e04fa4 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2907,7 +2907,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_MB, }, &max_replication_apply_lag, - 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ NULL, NULL, NULL }, @@ -2919,7 +2919,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_MB, }, &max_replication_flush_lag, - 0, 0, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + -1, -1, INT_MAX, /* it should not be smaller than maximal 
size of WAL record */ NULL, NULL, NULL }, From 6d88bd2a63c1c60adbca285cacf17da1afb4ca8c Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Sat, 25 Dec 2021 19:42:14 +0300 Subject: [PATCH 086/167] Do not copy the obsolete apply_conf binary into Docker image --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4878e3cc755..496228cabcd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # Image with pre-built tools # FROM zenithdb/compute-tools:latest AS compute-deps -# Only to get ready zenith_ctl and apply_conf binaries as deps +# Only to get ready zenith_ctl binary as deppendency # # Image with Postgres build deps @@ -56,7 +56,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local # Copy binaries from compute-tools -COPY --from=compute-deps /usr/local/bin/apply_conf /usr/local/bin/apply_conf COPY --from=compute-deps /usr/local/bin/zenith_ctl /usr/local/bin/zenith_ctl # Add postgres shared objects to the search path From 39d07f03963b1b75ac96ac017c7ab348f66a636c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 31 Dec 2021 12:57:46 +0300 Subject: [PATCH 087/167] Reorder walproposer code in a more natural order (#112) Now functions in walproposer.c go in chronological order --- src/backend/replication/walproposer.c | 2945 +++++++++++++------------ 1 file changed, 1488 insertions(+), 1457 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 95b9c0ae32d..b307c79177d 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -60,6 +60,10 @@ char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; bool am_wal_proposer; +char *zenith_timeline_walproposer = NULL; +char *zenith_tenant_walproposer = NULL; +char *zenith_pageserver_connstring_walproposer = NULL; + /* Declared in walproposer.h, defined here, initialized in 
libpqwalproposer.c */ WalProposerFunctionsType *WalProposerFunctions = NULL; @@ -95,171 +99,412 @@ static TimestampTz last_reconnect_attempt; /* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ static bool syncSafekeepers; -/* Declarations of a few functions ahead of time, so that we can define them out of order. */ +/* Prototypes for private functions */ +static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); +static void WalProposerStart(void); +static void WalProposerLoop(void); +static void InitEventSet(void); +static void UpdateEventSet(WalKeeper *wk, uint32 events); +static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); +static void ShutdownConnection(WalKeeper *wk); +static void ResetConnection(WalKeeper *wk); +static long TimeToReconnect(TimestampTz now); +static void ReconnectWalKeepers(void); static void AdvancePollState(int i, uint32 events); +static term_t GetHighestTerm(TermHistory *th); +static term_t GetEpoch(WalKeeper *wk); +static void DetermineEpochStartLsn(void); +static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); +static void SendProposerElected(WalKeeper *wk); +static void WalProposerStartStreaming(XLogRecPtr startpos); +static void StartStreaming(WalKeeper *wk); +static void SendMessageToNode(int i, WalMessage *msg); +static void BroadcastMessage(WalMessage *msg); +static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); +static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static bool SendAppendRequests(WalKeeper *wk); +static bool RecvAppendResponses(WalKeeper *wk); +static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); +static XLogRecPtr CalculateDiskConsistentLsn(void); +static XLogRecPtr CalculateMinFlushLsn(void); +static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); +static void HandleWalKeeperResponse(void); static bool AsyncRead(int i, char **buf, int *buf_size); 
static bool AsyncReadFixed(int i, void *value, size_t value_size); static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); static bool AsyncFlush(WalKeeper *wk); -static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); -static void BroadcastMessage(WalMessage *msg); -static WalMessage *CreateMessageCommitLsnOnly(XLogRecPtr lsn); -static term_t GetHighestTerm(TermHistory *th); -static term_t GetEpoch(WalKeeper *wk); -static void SendProposerElected(WalKeeper *wk); -static void StartStreaming(WalKeeper *wk); -static bool SendAppendRequests(WalKeeper *wk); - /* - * Combine hot standby feedbacks from all walkeepers. + * WAL proposer bgworker entry point. */ -static void -CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +void +WalProposerMain(Datum main_arg) { - hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + /* Establish signal handlers. 
*/ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].feedback.hs.ts != 0) - { - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) - { - hs->xmin = walkeeper[i].feedback.hs.xmin; - hs->ts = walkeeper[i].feedback.hs.ts; - } - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) - { - hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; - hs->ts = walkeeper[i].feedback.hs.ts; - } - } - } -} + BackgroundWorkerUnblockSignals(); -/* - * Get minimum of disk consistent LSNs of all safekeepers - */ -static XLogRecPtr -CalculateDiskConsistentLsn(void) -{ - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + GetXLogReplayRecPtr(&ThisTimeLineID); + + WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); + + last_reconnect_attempt = GetCurrentTimestamp(); + + application_name = (char *) "walproposer"; /* for + * synchronous_standby_names */ + am_wal_proposer = true; + am_walsender = true; + InitWalSender(); + + /* Create replication slot for WAL proposer if not exists */ + if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) { - if (walkeeper[i].feedback.diskConsistentLsn < lsn) - { - lsn = walkeeper[i].feedback.diskConsistentLsn; - } + ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); + ReplicationSlotReserveWal(); + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); + ReplicationSlotRelease(); } - return lsn; + + WalProposerStart(); } /* - * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the - * last WAL record that can be safely discarded. + * Entry point for `postgres --sync-safekeepers`. 
*/ -static XLogRecPtr -CalculateMinFlushLsn(void) +void +WalProposerSync(int argc, char *argv[]) { - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].feedback.flushLsn < lsn) - lsn = walkeeper[i].feedback.flushLsn; - } - return lsn; -} + syncSafekeepers = true; -/* Initializes the internal event set, provided that it is currently null */ -static void -InitEventSet(void) -{ - if (waitEvents) - elog(FATAL, "double-initialization of event set"); + InitStandaloneProcess(argv[0]); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); - AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (!SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * zenith extension sets PGC_POSTMASTER gucs requiring this. + */ + process_shared_preload_libraries_in_progress = true; + + /* + * Initialize postmaster_alive_fds as WaitEventSet checks them. 
+ * + * Copied from InitPostmasterDeathWatchHandle() + */ + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + + WalProposerInit(0, 0); + + process_shared_preload_libraries_in_progress = false; + + BackgroundWorkerUnblockSignals(); + + WalProposerStart(); } /* - * Updates the events we're already waiting on for the WAL keeper, setting it to - * the provided `events` - * - * This function is called any time the WAL keeper's state switches to one where - * it has to wait to continue. This includes the full body of AdvancePollState - * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. + * Create new AppendRequest message and start sending it. This function is + * called from walsender every time the new WAL is available. */ -static void -UpdateEventSet(WalKeeper *wk, uint32 events) +void +WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) { - /* eventPos = -1 when we don't have an event */ - Assert(wk->eventPos != -1); + WalMessage *msg = CreateMessage(startpos, data, len); - ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); + if (msg != NULL) + BroadcastMessage(msg); } -/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. +/* + * Advance the WAL proposer state machine, waiting each time for events to occur. + * Will exit only when latch is set, i.e. new WAL should be pushed from walsender + * to walproposer. 
*/ -static void -HackyRemoveWalProposerEvent(WalKeeper *to_remove) +void +WalProposerPoll(void) { - /* Remove the existing event set */ - if (waitEvents) + while (true) { - FreeWaitEventSet(waitEvents); - waitEvents = NULL; - } - /* Re-initialize it without adding any walkeeper events */ - InitEventSet(); + WalKeeper *wk; + int rc; + int i; + WaitEvent event; + TimestampTz now = GetCurrentTimestamp(); - /* - * loop through the existing walkeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < n_walkeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - WalKeeper *wk = &walkeeper[i]; + rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), + &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); + wk = (WalKeeper *) event.user_data; + i = (int) (wk - walkeeper); - wk->eventPos = -1; + /* + * If the event contains something that one of our walkeeper states + * was waiting for, we'll advance its state. + */ + if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) + AdvancePollState(i, event.events); - if (wk == to_remove) - continue; + /* + * If the timeout expired, attempt to reconnect to any walkeepers that + * we dropped + */ + ReconnectWalKeepers(); - /* If this WAL keeper isn't offline, add an event for it! */ - if (wk->conn != NULL) + /* + * If wait is terminated by latch set (walsenders' latch is set on + * each wal flush), then exit loop. 
(no need for pm death check due to
+		 * WL_EXIT_ON_PM_DEATH)
+		 */
+		if (rc != 0 && (event.events & WL_LATCH_SET))
 		{
-			desired_events = WalKeeperStateDesiredEvents(wk->state);
-			wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk);
+			ResetLatch(MyLatch);
+			break;
+		}
+		if (rc == 0)			/* timeout expired: poll state */
+		{
+			/*
+			 * If no WAL was generated during timeout (and we have already
+			 * collected the quorum), then send poll message
+			 */
+			if (lastSentLsn != InvalidXLogRecPtr)
+			{
+				BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn));
+			}
 		}
 	}
 }
 
-/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */
-static void
-ShutdownConnection(WalKeeper *wk)
+/*
+ * Register a background worker proposing WAL to wal acceptors.
+ */
+void
+WalProposerRegister(void)
 {
-	if (wk->conn)
-		walprop_finish(wk->conn);
-	wk->conn = NULL;
-	wk->state = SS_OFFLINE;
-	wk->flushWrite = false;
-	wk->currMsg = NULL;
-	wk->ackMsg = NULL;
+	BackgroundWorker bgw;
 
-	if (wk->voteResponse.termHistory.entries)
-		pfree(wk->voteResponse.termHistory.entries);
-	wk->voteResponse.termHistory.entries = NULL;
+	if (*wal_acceptors_list == '\0')
+		return;
+
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+static void
+WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
+{
+	char	   *host;
+	char	   *sep;
+	char	   *port;
+
+	/* Load the libpq-specific functions */
+	load_file("libpqwalproposer", false);
+	if (WalProposerFunctions == NULL)
+		elog(ERROR, "libpqwalproposer didn't initialize 
correctly"); + + load_file("libpqwalreceiver", false); + if (WalReceiverFunctions == NULL) + elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + load_file("zenith", false); + + for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) + { + port = strchr(host, ':'); + if (port == NULL) + { + elog(FATAL, "port is not specified"); + } + *port++ = '\0'; + sep = strchr(port, ','); + if (sep != NULL) + *sep++ = '\0'; + if (n_walkeepers + 1 >= MAX_WALKEEPERS) + { + elog(FATAL, "Too many walkeepers"); + } + walkeeper[n_walkeepers].host = host; + walkeeper[n_walkeepers].port = port; + walkeeper[n_walkeepers].state = SS_OFFLINE; + walkeeper[n_walkeepers].conn = NULL; + + /* + * Set conninfo to empty. We'll fill it out once later, in + * `ResetConnection` as needed + */ + walkeeper[n_walkeepers].conninfo[0] = '\0'; + initStringInfo(&walkeeper[n_walkeepers].outbuf); + walkeeper[n_walkeepers].flushWrite = false; + walkeeper[n_walkeepers].currMsg = NULL; + walkeeper[n_walkeepers].ackMsg = NULL; + walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; + n_walkeepers += 1; + } + if (n_walkeepers < 1) + { + elog(FATAL, "WalKeepers addresses are not specified"); + } + quorum = n_walkeepers / 2 + 1; + + /* Fill the greeting package */ + proposerGreeting.tag = 'g'; + proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; + proposerGreeting.pgVersion = PG_VERSION_NUM; + pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); + proposerGreeting.systemId = systemId; + if (!zenith_timeline_walproposer) + elog(FATAL, "zenith.zenith_timeline is not provided"); + if (*zenith_timeline_walproposer != '\0' && + !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + if (!zenith_tenant_walproposer) + elog(FATAL, "zenith.zenith_tenant is not provided"); + if (*zenith_tenant_walproposer != '\0' && + 
!HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) + elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + + proposerGreeting.timeline = ThisTimeLineID; + proposerGreeting.walSegSize = wal_segment_size; + + InitEventSet(); +} + +static void +WalProposerStart(void) +{ + + /* Initiate connections to all walkeeper nodes */ + for (int i = 0; i < n_walkeepers; i++) + { + ResetConnection(&walkeeper[i]); + } + + WalProposerLoop(); +} + +static void +WalProposerLoop(void) +{ + while (true) + WalProposerPoll(); +} + +/* Initializes the internal event set, provided that it is currently null */ +static void +InitEventSet(void) +{ + if (waitEvents) + elog(FATAL, "double-initialization of event set"); + + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); + AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); +} + +/* + * Updates the events we're already waiting on for the WAL keeper, setting it to + * the provided `events` + * + * This function is called any time the WAL keeper's state switches to one where + * it has to wait to continue. This includes the full body of AdvancePollState + * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. + */ +static void +UpdateEventSet(WalKeeper *wk, uint32 events) +{ + /* eventPos = -1 when we don't have an event */ + Assert(wk->eventPos != -1); + + ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); +} + +/* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. + * + * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. 
+ */ +static void +HackyRemoveWalProposerEvent(WalKeeper *to_remove) +{ + /* Remove the existing event set */ + if (waitEvents) + { + FreeWaitEventSet(waitEvents); + waitEvents = NULL; + } + /* Re-initialize it without adding any walkeeper events */ + InitEventSet(); + + /* + * loop through the existing walkeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < n_walkeepers; i++) + { + uint32 desired_events = WL_NO_EVENTS; + WalKeeper *wk = &walkeeper[i]; + + wk->eventPos = -1; + + if (wk == to_remove) + continue; + + /* If this WAL keeper isn't offline, add an event for it! */ + if (wk->conn != NULL) + { + desired_events = WalKeeperStateDesiredEvents(wk->state); + wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); + } + } +} + +/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ +static void +ShutdownConnection(WalKeeper *wk) +{ + if (wk->conn) + walprop_finish(wk->conn); + wk->conn = NULL; + wk->state = SS_OFFLINE; + wk->flushWrite = false; + wk->currMsg = NULL; + wk->ackMsg = NULL; + + if (wk->voteResponse.termHistory.entries) + pfree(wk->voteResponse.termHistory.entries); + wk->voteResponse.termHistory.entries = NULL; HackyRemoveWalProposerEvent(wk); } @@ -357,1495 +602,1307 @@ ResetConnection(WalKeeper *wk) } /* - * Calculate WAL position acknowledged by quorum + * How much milliseconds left till we should attempt reconnection to + * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect + * (do we actually need this?). 
*/ -static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(void) +static long +TimeToReconnect(TimestampTz now) { - XLogRecPtr responses[MAX_WALKEEPERS]; + TimestampTz passed; + TimestampTz till_reconnect; - /* - * Sort acknowledged LSNs - */ - for (int i = 0; i < n_walkeepers; i++) - { - /* - * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. - */ - responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? - walkeeper[i].feedback.flushLsn : 0; - } - qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); + if (wal_acceptor_reconnect_timeout <= 0) + return -1; - /* - * Get the smallest LSN committed by quorum - */ - return responses[n_walkeepers - quorum]; + passed = now - last_reconnect_attempt; + till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; + if (till_reconnect <= 0) + return 0; + return (long) (till_reconnect / 1000); } +/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ static void -HandleWalKeeperResponse(void) +ReconnectWalKeepers(void) { - HotStandbyFeedback hsFeedback; - XLogRecPtr minQuorumLsn; - XLogRecPtr diskConsistentLsn; - XLogRecPtr minFlushLsn; - - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = CalculateDiskConsistentLsn(); + TimestampTz now = GetCurrentTimestamp(); - if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) + if (TimeToReconnect(now) == 0) { - - if (minQuorumLsn > lastFeedback.flushLsn) - lastFeedback.flushLsn = minQuorumLsn; - - lastFeedback.diskConsistentLsn = diskConsistentLsn; - - /* advance the replication slot */ - if (!syncSafekeepers) - ProcessStandbyReply( - // write_lsn - This is what durably stored in WAL service. - lastFeedback.flushLsn, - //flush_lsn - This is what durably stored in WAL service. - lastFeedback.flushLsn, - //apply_lsn - This is what processed and durably saved at pageserver. 
- lastFeedback.diskConsistentLsn, - GetCurrentTimestamp(), false); - } - - CombineHotStanbyFeedbacks(&hsFeedback); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) - { - lastFeedback.hs = hsFeedback; - if (!syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); - } - - /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. - * - * Advanced truncateLsn should be not further than nearest commitLsn. - * This prevents surprising violation of truncateLsn <= commitLsn - * invariant which might occur because 1) truncateLsn can be advanced - * immediately once chunk is broadcast to all safekeepers, and - * commitLsn generally can't be advanced based on feedback from - * safekeeper who is still in the previous epoch (similar to 'leader - * can't commit entries from previous term' in Raft); 2) chunks we - * read from WAL and send are plain sheets of bytes, but safekeepers - * ack only on record boundaries. 
- */ - minFlushLsn = CalculateMinFlushLsn(); - if (minFlushLsn > truncateLsn) - truncateLsn = minFlushLsn; - - /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) - { - WalMessage *msg = msgQueueHead; - msgQueueHead = msg->next; - - memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); - free(msg); - } - if (!msgQueueHead) /* queue is empty */ - msgQueueTail = NULL; - /* truncateLsn always points to the first chunk in the queue */ - if (msgQueueHead) - { - /* Max takes care of special 0-sized messages */ - Assert(truncateLsn >= msgQueueHead->req.beginLsn && - truncateLsn < Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); - } - - /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and - * basically the random one gets connected, to prevent hanging basebackup - * (due to pageserver connecting to not-synced-walkeeper) we currently - * wait for all seemingly alive walkeepers to get synced. - */ - if (syncSafekeepers) - { - int n_synced; - - n_synced = 0; - for (int i = 0; i < n_walkeepers; i++) - { - WalKeeper *wk = &walkeeper[i]; - bool synced = wk->feedback.commitLsn >= propEpochStartLsn; - - /* alive safekeeper which is not synced yet; wait for it */ - if (wk->state != SS_OFFLINE && !synced) - return; - if (synced) - n_synced++; - } - if (n_synced >= quorum) - { - /* All walkeepers synced! 
*/ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - } -} - -char *zenith_timeline_walproposer = NULL; -char *zenith_tenant_walproposer = NULL; -char *zenith_pageserver_connstring_walproposer = NULL; - - -static void -WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) -{ - char *host; - char *sep; - char *port; - - /* Load the libpq-specific functions */ - load_file("libpqwalproposer", false); - if (WalProposerFunctions == NULL) - elog(ERROR, "libpqwalproposer didn't initialize correctly"); - - load_file("libpqwalreceiver", false); - if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("zenith", false); - - for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) - { - port = strchr(host, ':'); - if (port == NULL) - { - elog(FATAL, "port is not specified"); - } - *port++ = '\0'; - sep = strchr(port, ','); - if (sep != NULL) - *sep++ = '\0'; - if (n_walkeepers + 1 >= MAX_WALKEEPERS) - { - elog(FATAL, "Too many walkeepers"); - } - walkeeper[n_walkeepers].host = host; - walkeeper[n_walkeepers].port = port; - walkeeper[n_walkeepers].state = SS_OFFLINE; - walkeeper[n_walkeepers].conn = NULL; - - /* - * Set conninfo to empty. 
We'll fill it out once later, in - * `ResetConnection` as needed - */ - walkeeper[n_walkeepers].conninfo[0] = '\0'; - initStringInfo(&walkeeper[n_walkeepers].outbuf); - walkeeper[n_walkeepers].flushWrite = false; - walkeeper[n_walkeepers].currMsg = NULL; - walkeeper[n_walkeepers].ackMsg = NULL; - walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; - n_walkeepers += 1; - } - if (n_walkeepers < 1) - { - elog(FATAL, "WalKeepers addresses are not specified"); - } - quorum = n_walkeepers / 2 + 1; - - /* Fill the greeting package */ - proposerGreeting.tag = 'g'; - proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; - proposerGreeting.pgVersion = PG_VERSION_NUM; - pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); - proposerGreeting.systemId = systemId; - if (!zenith_timeline_walproposer) - elog(FATAL, "zenith.zenith_timeline is not provided"); - if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); - if (!zenith_tenant_walproposer) - elog(FATAL, "zenith.zenith_tenant is not provided"); - if (*zenith_tenant_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); - - proposerGreeting.timeline = ThisTimeLineID; - proposerGreeting.walSegSize = wal_segment_size; - - InitEventSet(); -} - -static void -WalProposerLoop(void) -{ - while (true) - WalProposerPoll(); -} - -static void -WalProposerStart(void) -{ - - /* Initiate connections to all walkeeper nodes */ - for (int i = 0; i < n_walkeepers; i++) - { - ResetConnection(&walkeeper[i]); - } - - WalProposerLoop(); -} - -/* - * WAL proposer bgworeker entry point - */ -void -WalProposerMain(Datum main_arg) -{ - /* Establish signal handlers. 
*/ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - - BackgroundWorkerUnblockSignals(); - - GetXLogReplayRecPtr(&ThisTimeLineID); - - WalProposerInit(GetFlushRecPtr(), GetSystemIdentifier()); - - last_reconnect_attempt = GetCurrentTimestamp(); - - application_name = (char *) "walproposer"; /* for - * synchronous_standby_names */ - am_wal_proposer = true; - am_walsender = true; - InitWalSender(); - - /* Create replication slot for WAL proposer if not exists */ - if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) - { - ReplicationSlotCreate(WAL_PROPOSER_SLOT_NAME, false, RS_PERSISTENT, false); - ReplicationSlotReserveWal(); - /* Write this slot to disk */ - ReplicationSlotMarkDirty(); - ReplicationSlotSave(); - ReplicationSlotRelease(); - } - - WalProposerStart(); -} - -void -WalProposerSync(int argc, char *argv[]) -{ - syncSafekeepers = true; - - InitStandaloneProcess(argv[0]); - - SetProcessingMode(InitProcessing); - - /* - * Set default values for command-line options. - */ - InitializeGUCOptions(); - - /* Acquire configuration parameters */ - if (!SelectConfigFiles(NULL, progname)) - exit(1); - - /* - * Imitate we are early in bootstrap loading shared_preload_libraries; - * zenith extension sets PGC_POSTMASTER gucs requiring this. - */ - process_shared_preload_libraries_in_progress = true; - - /* - * Initialize postmaster_alive_fds as WaitEventSet checks them. 
- * - * Copied from InitPostmasterDeathWatchHandle() - */ - if (pipe(postmaster_alive_fds) < 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg_internal("could not create pipe to monitor postmaster death: %m"))); - if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) - ereport(FATAL, - (errcode_for_socket_access(), - errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); - - WalProposerInit(0, 0); - - process_shared_preload_libraries_in_progress = false; - - BackgroundWorkerUnblockSignals(); - - WalProposerStart(); -} - -static void -WalProposerStartStreaming(XLogRecPtr startpos) -{ - StartReplicationCmd cmd; - - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); - cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = proposerGreeting.timeline; - cmd.startpoint = startpos; - StartReplication(&cmd); -} + last_reconnect_attempt = now; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_OFFLINE) + ResetConnection(&walkeeper[i]); + } + } +} /* - * Start sending message to the particular node. - * - * Always updates the state and event set for the WAL keeper; setting either of - * these before calling would be redundant work. + * Performs the logic for advancing the state machine of the 'i'th walkeeper, + * given that a certain set of events has occured. */ static void -SendMessageToNode(int i, WalMessage *msg) +AdvancePollState(int i, uint32 events) { WalKeeper *wk = &walkeeper[i]; - - /* we shouldn't be already sending something */ - Assert(wk->currMsg == NULL); - /* - * Skip already acknowledged messages. Used after reconnection to get to - * the first not yet sent message. Otherwise we always just send 'msg'. 
+ * Keep advancing the state while either: (a) the event is still + * unprocessed (usually because it's the first iteration of the loop), or + * (b) the state can execute, and does not need to wait for any socket + * events */ - while (msg != NULL && (msg->ackMask & (1 << i)) != 0) - msg = msg->next; + while (events || StateShouldImmediatelyExecute(wk->state)) + { + /* + * Sanity check. We assume further down that the operations don't + * block because the socket is ready. + */ + AssertEventsOkForState(events, wk); - wk->currMsg = msg; - wk->flushWrite = false; + /* Execute the code corresponding to the current state */ + switch (wk->state) + { + /* + * WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", + wk->host, wk->port); + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ + + /* + * Both connecting states run the same logic. The only + * difference is the events they're expecting + */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + { + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - if (!SendAppendRequests(wk)) - return; -} + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; -/* - * Broadcast new message to all caught-up walkeepers - */ -static void -BroadcastMessage(WalMessage *msg) -{ - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) - { - SendMessageToNode(i, msg); - } - } -} + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); -static WalMessage * -CreateMessage(XLogRecPtr startpos, char *data, int len) -{ - /* Create new message and append it to message queue */ - WalMessage *msg; - XLogRecPtr endpos; + /* + 
* Once we're fully connected, we can move to the + * next state + */ + wk->state = SS_EXEC_STARTWALPUSH; - len -= XLOG_HDR_SIZE; - endpos = startpos + len; - if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) - { - /* Message already queued */ - return NULL; - } - Assert(len >= 0); - msg = (WalMessage *) malloc(sizeof(WalMessage) + len); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else - msgQueueHead = msg; - msgQueueTail = msg; + /* + * Even though SS_EXEC_STARTWALPUSH doesn't wait + * on anything, we do need to replace the current + * event, so we have to just pick something. We'll + * eventually need the socket to be readable, so + * we go with that. + */ + new_events = WL_SOCKET_READABLE; + break; - msg->size = sizeof(AppendRequestHeader) + len; - msg->next = NULL; - msg->ackMask = 0; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; - msg->req.beginLsn = startpos; - msg->req.endLsn = endpos; - msg->req.proposerId = proposerGreeting.proposerId; - memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; - Assert(msg->req.endLsn >= lastSentLsn); - lastSentLsn = msg->req.endLsn; - return msg; -} + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); -void -WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) -{ - WalMessage *msg = CreateMessage(startpos, data, len); + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectWalKeepers. 
+ */ + ShutdownConnection(wk); + return; + } - if (msg != NULL) - BroadcastMessage(msg); -} + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. + */ + HackyRemoveWalProposerEvent(wk); + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); + break; + } -/* - * Create WAL message with no data, just to let the walkeepers - * know that commit lsn has advanced. - */ -static WalMessage * -CreateMessageCommitLsnOnly(XLogRecPtr lsn) -{ - /* Create new message and append it to message queue */ - WalMessage *msg; + /* + * Send "START_WAL_PUSH" command to the walkeeper. After + * sending, wait for response with SS_WAIT_EXEC_RESULT + */ + case SS_EXEC_STARTWALPUSH: + { + char *query = NULL; + if (zenith_pageserver_connstring_walproposer != NULL) { + query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); + } else { + query = psprintf("START_WAL_PUSH"); + } + if (!walprop_send_query(wk->conn, query)) + { + pfree(query); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; + } + pfree(query); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); + break; + } - msg = (WalMessage *) malloc(sizeof(WalMessage)); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else - msgQueueHead = msg; - msgQueueTail = msg; + case SS_WAIT_EXEC_RESULT: + switch (walprop_get_query_result(wk->conn)) + { + /* + * Successful result, move on to starting the + * handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: - msg->size = sizeof(AppendRequestHeader); - msg->next = NULL; - msg->ackMask = 0; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; + /* + * Because this state is immediately executable, we'll + * start this on the next iteration of the loop + */ + 
wk->state = SS_HANDSHAKE_SEND; + break; - /* - * This serves two purposes: 1) After all msgs from previous epochs are - * pushed we queue empty WalMessage with lsn set to epochStartLsn which - * commands to switch the epoch, which allows to do the switch without - * creating new epoch records (we especially want to avoid such in --sync - * mode). Walproposer can advance commit_lsn only after the switch, so - * this lsn (reported back) also is the first possible advancement point. - * 2) Maintain common invariant of queue entries sorted by LSN. - */ - msg->req.beginLsn = lsn; - msg->req.endLsn = lsn; - msg->req.proposerId = proposerGreeting.proposerId; + /* + * Needs repeated calls to finish. Wait until the + * socket is readable + */ + case WP_EXEC_NEEDS_INPUT: - /* - * truncateLsn and commitLsn are set just before the message sent, in - * SendMessageToNode() - */ - return msg; -} + /* + * SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set + */ + break; -/* latest term in TermHistory, or 0 is there is no entries */ -static term_t -GetHighestTerm(TermHistory *th) -{ - return th->n_entries > 0 ? 
th->entries[th->n_entries - 1].term : 0;
-}
+					case WP_EXEC_FAILED:
+						elog(WARNING, "Failed to send query to walkeeper %s:%s: %s",
+							 wk->host, wk->port, walprop_error_message(wk->conn));
+						ShutdownConnection(wk);
+						return;
 
-/* safekeeper's epoch is the term of the highest entry in the log */
-static term_t
-GetEpoch(WalKeeper *wk)
-{
-	return GetHighestTerm(&wk->voteResponse.termHistory);
-}
+						/*
+						 * Unexpected result -- fundamentally an error, but we
+						 * want to produce a custom message, rather than a
+						 * generic "something went wrong"
+						 */
+					case WP_EXEC_UNEXPECTED_SUCCESS:
+						elog(WARNING, "Received bad response from walkeeper %s:%s query execution",
+							 wk->host, wk->port);
+						ShutdownConnection(wk);
+						return;
+				}
+				break;
 
-/*
- * Called after majority of acceptors gave votes, it calculates the most
- * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since
- * which we'll write WAL in our term.
- *
- * Sets truncateLsn along the way (though it is not of much use at this point --
- * only for skipping recovery).
- */
-static void
-DetermineEpochStartLsn(void)
-{
-	TermHistory *dth;
+			/*
+			 * Start handshake: first of all send information about the
+			 * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for
+			 * a response to finish the handshake.
+			 */
+			case SS_HANDSHAKE_SEND:
 
-	propEpochStartLsn = InvalidXLogRecPtr;
-	donorEpoch = 0;
-	truncateLsn = InvalidXLogRecPtr;
+				/*
+				 * On failure, logging & resetting the connection is handled.
+				 * We just need to handle the control flow. 
+ */ + if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) + return; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_IDLE) - { - if (GetEpoch(&walkeeper[i]) > donorEpoch || - (GetEpoch(&walkeeper[i]) == donorEpoch && - walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) - { - donorEpoch = GetEpoch(&walkeeper[i]); - propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; - donor = i; - } - truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); - } - } + break; - /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing - * was committed yet. To keep the idea of always starting streaming since - * record boundary (which simplifies decoding on safekeeper), take start - * position of the slot. - */ - if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) - { - (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); - propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; - ReplicationSlotRelease(); - elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); - } + /* + * Finish handshake comms: receive information about the WAL + * keeper + */ + case SS_HANDSHAKE_RECV: - /* - * If propEpochStartLsn is not 0, at least one msg with WAL was sent to - * some connected safekeeper; it must have carried truncateLsn pointing to - * the first record. - */ - Assert((truncateLsn != InvalidXLogRecPtr) || - (syncSafekeepers && truncateLsn == propEpochStartLsn)); + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ + if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) + return; - /* - * Proposer's term history is the donor's + its own entry. 
- */ - dth = &walkeeper[donor].voteResponse.termHistory; - propTermHistory.n_entries = dth->n_entries + 1; - propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); - memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); - propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; - propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; + /* Protocol is all good, move to voting. */ + wk->state = SS_VOTING; - elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - quorum, - propTerm, - LSN_FORMAT_ARGS(propEpochStartLsn), - walkeeper[donor].host, walkeeper[donor].port, - LSN_FORMAT_ARGS(truncateLsn) - ); -} + /* + * Don't need to update the event set yet. Either we update + * the event set to WL_SOCKET_READABLE *or* we change the + * state to SS_SEND_VOTE in the loop below + */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + wk->feedback.flushLsn = truncateLsn; + wk->feedback.hs.ts = 0; -/* - * How much milliseconds left till we should attempt reconnection to - * safekeepers? Returns 0 if it is already high time, -1 if we never reconnect - * (do we actually need this?). - */ -static long -TimeToReconnect(TimestampTz now) -{ - TimestampTz passed; - TimestampTz till_reconnect; + /* + * We want our term to be highest and unique, so choose max + * and +1 once we have majority. + */ + propTerm = Max(walkeeper[i].greet.term, propTerm); - if (wal_acceptor_reconnect_timeout <= 0) - return -1; + /* + * Check if we have quorum. If there aren't enough walkeepers, + * wait and do nothing. We'll eventually get a task when the + * election starts. + * + * If we do have quorum, we can start an election + */ + if (++n_connected < quorum) + { + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. 
+ */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + } + else + { + if (n_connected == quorum) + { + propTerm++; + /* prepare voting message */ + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + } - passed = now - last_reconnect_attempt; - till_reconnect = wal_acceptor_reconnect_timeout * 1000 - passed; - if (till_reconnect <= 0) - return 0; - return (long) (till_reconnect / 1000); -} + /* + * Now send voting request to the cohort and wait + * responses + */ + for (int j = 0; j < n_walkeepers; j++) + { + /* + * Remember: SS_VOTING indicates that the walkeeper is + * participating in voting, but hasn't sent anything + * yet. The ones that have sent something are given + * SS_SEND_VOTE or SS_WAIT_VERDICT. + */ + if (walkeeper[j].state == SS_VOTING) + { + walkeeper[j].state = SS_SEND_VOTE; + /* Immediately send info */ + AdvancePollState(j, WL_NO_EVENTS); + } + } + } + break; -/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ -static void -ReconnectWalKeepers(void) -{ - TimestampTz now = GetCurrentTimestamp(); + /* + * Voting is an idle state - we don't expect any events to + * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see + * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. 
+ */ + case SS_VOTING: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + break; - if (TimeToReconnect(now) == 0) - { - last_reconnect_attempt = now; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_OFFLINE) - ResetConnection(&walkeeper[i]); - } - } -} + /* We have quorum for voting, send our vote request */ + case SS_SEND_VOTE: + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; -/* - * Receive WAL from most advanced WAL keeper - */ -static bool -WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) -{ - char conninfo[MAXCONNINFO]; - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ + break; - sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - walkeeper[donor].host, walkeeper[donor].port, - err))); - return false; - } - elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + /* Start reading the walkeeper response for our candidate */ + case SS_WAIT_VERDICT: + wk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) + return; - options.logical = false; - options.startpoint = startpos; - 
options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), + LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; + /* + * In case of acceptor rejecting our vote, bail out, but only + * if either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. + */ + if ((!wk->voteResponse.voteGiven) && + (wk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->voteResponse.term, propTerm); + } + Assert(wk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? 
*/ + n_votes++; + if (n_votes < quorum) + { + wk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - (void) CreateMessage(rec_start_lsn, buf, len); - elog(DEBUG1, "Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len); - if (rec_end_lsn >= endpos) - break; - } - } - elog(DEBUG1, "end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)); - walrcv_disconnect(wrconn); - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } + /* recovery already performed, just start streaming */ + SendProposerElected(wk); + } + else + { + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ - return true; -} + DetermineEpochStartLsn(); -/* - * Determine for wk the starting streaming point and send it message - * 1) Announcing we are elected proposer (which immediately advances epoch if - * safekeeper is synced, being important for sync-safekeepers) - * 2) Communicating starting streaming point -- safekeeper must truncate its WAL - * beyond it -- and history of term switching. - * - * Sets wk->startStreamingAt. 
- */ -static void -SendProposerElected(WalKeeper *wk) -{ - ProposerElected msg; - TermHistory *th; - term_t lastCommonTerm; - int i; + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } - /* - * Determine start LSN by comparing safekeeper's log term switch history and - * proposer's, searching for the divergence point. - * - * Note: there is a vanishingly small chance of no common point even if - * there is some WAL on safekeeper, if immediately after bootstrap compute - * wrote some WAL on single sk and died; we stream since the beginning then. - */ - th = &wk->voteResponse.termHistory; - /* - * If any WAL is present on the sk, it must be authorized by some term. - * OTOH, without any WAL there are no term swiches in the log. - */ - Assert((th->n_entries == 0) == - (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); - /* We must start somewhere. 
*/ - Assert(propTermHistory.n_entries >= 1); + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + SendProposerElected(&walkeeper[i]); + } - for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) - { - if (propTermHistory.entries[i].term != th->entries[i].term) - break; - /* term must begin everywhere at the same point */ - Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); - } - i--; /* step back to the last common term */ - if (i < 0) - { - /* safekeeper is empty or no common point, start from the beginning */ - wk->startStreamingAt = propTermHistory.entries[0].lsn; - } - else - { - /* - * End of (common) term is the start of the next except it is the last - * one; there it is flush_lsn in case of safekeeper or, in case of - * proposer, LSN it is currently writing, but then we just pick - * safekeeper pos as it obviously can't be higher. - */ - if (propTermHistory.entries[i].term == propTerm) - { - wk->startStreamingAt = wk->voteResponse.flushLsn; - } - else - { - XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; - XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? th->entries[i + 1].lsn : - wk->voteResponse.flushLsn); - wk->startStreamingAt = Min(propEndLsn, skEndLsn); - } - } + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. + */ - Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + if (syncSafekeepers) + { + /* + * Queue empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. 
+ */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - msg.tag = 'e'; - msg.term = propTerm; - msg.startStreamingAt = wk->startStreamingAt; - msg.termHistory = &propTermHistory; + /* keep polling until all walkeepers are synced */ + return; + } - lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; - elog(LOG, - "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", - msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); - - resetStringInfo(&wk->outbuf); - pq_sendint64_le(&wk->outbuf, msg.tag); - pq_sendint64_le(&wk->outbuf, msg.term); - pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); - for (int i = 0; i < msg.termHistory->n_entries; i++) - { - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); - } + WalProposerStartStreaming(propEpochStartLsn); + /* Should not return here */ + } - if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) - return; + break; - StartStreaming(wk); -} + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: -/* - * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. - */ -static void -StartStreaming(WalKeeper *wk) -{ - int wki = wk - walkeeper; + /* + * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once + * the flush completes. If we still have more to do, we'll + * wait until the next poll comes along. + */ + if (!AsyncFlush(wk)) + return; + + StartStreaming(wk); - /* - * This is the only entrypoint to state SS_ACTIVE. It's executed - * exactly once for a connection. 
- */ - wk->state = SS_ACTIVE; - UpdateEventSet(wk, WL_SOCKET_READABLE); + break; - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= wk->startStreamingAt) - { - /* message is already received by this walkeeper */ - msg->ackMask |= 1 << wki; - } - else - { - SendMessageToNode(wki, msg); - return; - } - } -} -/* - * Advance the WAL proposer state machine, waiting each time for events to occur - */ -void -WalProposerPoll(void) -{ - while (true) - { - WalKeeper *wk; - int rc; - int i; - WaitEvent event; - TimestampTz now = GetCurrentTimestamp(); + /* + * Idle state for sending WAL. Moved out only by calls to + * SendMessageToNode + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + break; - rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), - &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - wk = (WalKeeper *) event.user_data; - i = (int) (wk - walkeeper); - /* - * If the event contains something that one of our walkeeper states - * was waiting for, we'll advance its state. - */ - if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(i, event.events); + case SS_ACTIVE: + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(wk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(wk)) + return; - /* - * If the timeout expired, attempt to reconnect to any walkeepers that - * we dropped - */ - ReconnectWalKeepers(); + UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); + break; + } /* - * If wait is terminated by latch set (walsenders' latch is set on - * each wal flush), then exit loop. (no need for pm death check due to - * WL_EXIT_ON_PM_DEATH) + * We've already done something for these events - don't attempt more + * states than we need to. 
*/ - if (rc != 0 && (event.events & WL_LATCH_SET)) - { - ResetLatch(MyLatch); - break; - } - if (rc == 0) /* timeout expired: poll state */ - { - /* - * If no WAL was generated during timeout (and we have already - * collected the quorum), then send pool message - */ - if (lastSentLsn != InvalidXLogRecPtr) - { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - } - } + events = WL_NO_EVENTS; } } +/* latest term in TermHistory, or 0 is there is no entries */ +static term_t +GetHighestTerm(TermHistory *th) +{ + return th->n_entries > 0 ? th->entries[th->n_entries - 1].term : 0; +} + +/* safekeeper's epoch is the term of the highest entry in the log */ +static term_t +GetEpoch(WalKeeper *wk) +{ + return GetHighestTerm(&wk->voteResponse.termHistory); +} + /* - * Send queue messages starting from wk->currMsg until the end or non-writable - * socket, whichever comes first. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. + * Called after majority of acceptors gave votes, it calculates the most + * advanced safekeeper (who will be the donor) and epochStartLsn -- LSN since + * which we'll write WAL in our term. + * + * Sets truncateLsn along the way (though it is not of much use at this point -- + * only for skipping recovery). */ -static bool -SendAppendRequests(WalKeeper *wk) +static void +DetermineEpochStartLsn(void) { - int wki = wk - walkeeper; - WalMessage *msg; - AppendRequestHeader *req; + TermHistory *dth; - if (wk->flushWrite) - { - if (!AsyncFlush(wk)) - /* - * AsyncFlush failed, that could happen if the socket is closed or - * we have nothing to write and should wait for writeable socket. 
- */ - return wk->state == SS_ACTIVE; + propEpochStartLsn = InvalidXLogRecPtr; + donorEpoch = 0; + truncateLsn = InvalidXLogRecPtr; - wk->currMsg = wk->currMsg->next; - wk->flushWrite = false; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + { + if (GetEpoch(&walkeeper[i]) > donorEpoch || + (GetEpoch(&walkeeper[i]) == donorEpoch && + walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + { + donorEpoch = GetEpoch(&walkeeper[i]); + propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; + donor = i; + } + truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); + } } - while (wk->currMsg) + /* + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing + * was committed yet. To keep the idea of always starting streaming since + * record boundary (which simplifies decoding on safekeeper), take start + * position of the slot. + */ + if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { - msg = wk->currMsg; - req = &msg->req; + (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); + propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + ReplicationSlotRelease(); + elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); + } - req->commitLsn = GetAcknowledgedByQuorumWALPosition(); - req->truncateLsn = truncateLsn; + /* + * If propEpochStartLsn is not 0, at least one msg with WAL was sent to + * some connected safekeeper; it must have carried truncateLsn pointing to + * the first record. + */ + Assert((truncateLsn != InvalidXLogRecPtr) || + (syncSafekeepers && truncateLsn == propEpochStartLsn)); - Assert((msg->ackMask & (1 << wki)) == 0); + /* + * Proposer's term history is the donor's + its own entry. 
+ */ + dth = &walkeeper[donor].voteResponse.termHistory; + propTermHistory.n_entries = dth->n_entries + 1; + propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); + memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); + propTermHistory.entries[propTermHistory.n_entries - 1].term = propTerm; + propTermHistory.entries[propTermHistory.n_entries - 1].lsn = propEpochStartLsn; - /* - * If we need to send this message not from the beginning, - * form the cut version. Only happens for the first - * message. - */ - if (wk->startStreamingAt > msg->req.beginLsn) - { - uint32 len; - uint32 size; + elog(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + quorum, + propTerm, + LSN_FORMAT_ARGS(propEpochStartLsn), + walkeeper[donor].host, walkeeper[donor].port, + LSN_FORMAT_ARGS(truncateLsn) + ); +} - Assert(wk->startStreamingAt < req->endLsn); +/* + * Receive WAL from most advanced WAL keeper + */ +static bool +WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +{ + char conninfo[MAXCONNINFO]; + char *err; + WalReceiverConn *wrconn; + WalRcvStreamOptions options; - len = msg->req.endLsn - wk->startStreamingAt; - size = sizeof(AppendRequestHeader) + len; - req = malloc(size); - *req = msg->req; - req->beginLsn = wk->startStreamingAt; - memcpy(req + 1, - (char *) (&msg->req + 1) + wk->startStreamingAt - - msg->req.beginLsn, - len); - } + sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", + walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); + if (!wrconn) + { + ereport(WARNING, + (errmsg("could not connect to WAL acceptor %s:%s: %s", + walkeeper[donor].host, walkeeper[donor].port, + err))); + return false; + } + elog(LOG, + "start 
recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - elog(LOG, - "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + options.logical = false; + options.startpoint = startpos; + options.slotname = NULL; + options.proto.physical.startpointTLI = timeline; - /* if this is the first sent message, we should start processing feedback */ - if (wk->ackMsg == NULL) - wk->ackMsg = wk->currMsg; + if (walrcv_startstreaming(wrconn, &options)) + { + XLogRecPtr rec_start_lsn; + XLogRecPtr rec_end_lsn = 0; + int len; + char *buf; + pgsocket wait_fd = PGINVALID_SOCKET; - /* - * We write with msg->size here because the body of the - * message is stored after the end of the WalMessage - * struct, in the allocation for each msg - */ - if (!AsyncWrite(wk, req, - sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_ACTIVE)) + while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) { - if (req != &msg->req) - free(req); - if (wk->state == SS_ACTIVE) + if (len == 0) { - wk->flushWrite = true; - return true; + (void) WaitLatchOrSocket( + MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd, + -1, WAIT_EVENT_WAL_RECEIVER_MAIN); + } + else + { + Assert(buf[0] == 'w' || buf[0] == 'k'); + if (buf[0] == 'k') + continue; /* keepalive */ + memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], + sizeof rec_start_lsn); + rec_start_lsn = pg_ntoh64(rec_start_lsn); + rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; + (void) CreateMessage(rec_start_lsn, buf, len); + elog(DEBUG1, "Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len); + if (rec_end_lsn >= endpos) + 
break; } - return false; } - if (req != &msg->req) - free(req); - - /* continue writing the next message */ - wk->currMsg = wk->currMsg->next; + elog(DEBUG1, "end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)); + walrcv_disconnect(wrconn); + } + else + { + ereport(LOG, + (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", + timeline, (uint32) (startpos >> 32), (uint32) startpos))); + return false; } return true; } /* - * Receive and process all available feedback. - * - * Can change state if Async* functions encounter errors and reset connection. - * Returns false in this case, true otherwise. + * Determine for wk the starting streaming point and send it message + * 1) Announcing we are elected proposer (which immediately advances epoch if + * safekeeper is synced, being important for sync-safekeepers) + * 2) Communicating starting streaming point -- safekeeper must truncate its WAL + * beyond it -- and history of term switching. * - * NB: This function can call SendMessageToNode and produce new messages. + * Sets wk->startStreamingAt. */ -static bool -RecvAppendResponses(WalKeeper *wk) +static void +SendProposerElected(WalKeeper *wk) { - XLogRecPtr minQuorumLsn; - int wki = wk - walkeeper; - bool readAnything = false; + ProposerElected msg; + TermHistory *th; + term_t lastCommonTerm; + int i; - while (true) + /* + * Determine start LSN by comparing safekeeper's log term switch history and + * proposer's, searching for the divergence point. + * + * Note: there is a vanishingly small chance of no common point even if + * there is some WAL on safekeeper, if immediately after bootstrap compute + * wrote some WAL on single sk and died; we stream since the beginning then. + */ + th = &wk->voteResponse.termHistory; + /* + * If any WAL is present on the sk, it must be authorized by some term. + * OTOH, without any WAL there are no term swiches in the log. 
+ */ + Assert((th->n_entries == 0) == + (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); + /* We must start somewhere. */ + Assert(propTermHistory.n_entries >= 1); + + for (i = 0; i < Min(propTermHistory.n_entries, th->n_entries); i++) { - /* - * If our reading doesn't immediately succeed, any - * necessary error handling or state setting is taken care - * of. We can leave any other work until later. - */ - if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + if (propTermHistory.entries[i].term != th->entries[i].term) break; - - Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); - + /* term must begin everywhere at the same point */ + Assert(propTermHistory.entries[i].lsn == th->entries[i].lsn); + } + i--; /* step back to the last common term */ + if (i < 0) + { + /* safekeeper is empty or no common point, start from the beginning */ + wk->startStreamingAt = propTermHistory.entries[0].lsn; + } + else + { /* - * We shouldn't read responses ahead of wk->currMsg, because that will - * look like we are receiving responses for messages that haven't been - * sent yet. This can happen when message was placed in a buffer in - * SendAppendRequests, but sent through a wire only with a flush inside - * AsyncReadFixed. In this case, we should move wk->currMsg. + * End of (common) term is the start of the next except it is the last + * one; there it is flush_lsn in case of safekeeper or, in case of + * proposer, LSN it is currently writing, but then we just pick + * safekeeper pos as it obviously can't be higher. */ - if (wk->ackMsg == wk->currMsg) + if (propTermHistory.entries[i].term == propTerm) { - /* Couldn't happen without flush flag */ - Assert(wk->flushWrite); - - wk->currMsg = wk->currMsg->next; - wk->flushWrite = false; + wk->startStreamingAt = wk->voteResponse.flushLsn; + } + else + { + XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; + XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? 
th->entries[i + 1].lsn : + wk->voteResponse.flushLsn); + wk->startStreamingAt = Min(propEndLsn, skEndLsn); } + } - wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms - * receiving of this - * message */ + Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); - wk->ackMsg = wk->ackMsg->next; - readAnything = true; + msg.tag = 'e'; + msg.term = propTerm; + msg.startStreamingAt = wk->startStreamingAt; + msg.termHistory = &propTermHistory; + + lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; + elog(LOG, + "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", + msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); + + resetStringInfo(&wk->outbuf); + pq_sendint64_le(&wk->outbuf, msg.tag); + pq_sendint64_le(&wk->outbuf, msg.term); + pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); + for (int i = 0; i < msg.termHistory->n_entries; i++) + { + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); } - if (!readAnything) - return wk->state == SS_ACTIVE; + if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + return; - HandleWalKeeperResponse(); + StartStreaming(wk); +} - /* - * Also send the new commit lsn to all the walkeepers. - * - * FIXME: This is redundant for walkeepers that have other - * outbound messages pending. 
+/* + * Start walsender streaming replication + */ +static void +WalProposerStartStreaming(XLogRecPtr startpos) +{ + StartReplicationCmd cmd; + + elog(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); + cmd.slotname = WAL_PROPOSER_SLOT_NAME; + cmd.timeline = proposerGreeting.timeline; + cmd.startpoint = startpos; + StartReplication(&cmd); +} + +/* + * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. + */ +static void +StartStreaming(WalKeeper *wk) +{ + int wki = wk - walkeeper; + + /* + * This is the only entrypoint to state SS_ACTIVE. It's executed + * exactly once for a connection. */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - if (minQuorumLsn > lastSentCommitLsn) + wk->state = SS_ACTIVE; + UpdateEventSet(wk, WL_SOCKET_READABLE); + + for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); - lastSentCommitLsn = minQuorumLsn; + if (msg->req.endLsn <= wk->startStreamingAt) + { + /* message is already received by this walkeeper */ + msg->ackMask |= 1 << wki; + } + else + { + SendMessageToNode(wki, msg); + return; + } } - - return wk->state == SS_ACTIVE; } -/* Performs the logic for advancing the state machine of the 'i'th walkeeper, - * given that a certain set of events has occured. */ +/* + * Start sending message to the particular node. + * + * Always updates the state and event set for the WAL keeper; setting either of + * these before calling would be redundant work. 
+ */ static void -AdvancePollState(int i, uint32 events) +SendMessageToNode(int i, WalMessage *msg) { WalKeeper *wk = &walkeeper[i]; + + /* we shouldn't be already sending something */ + Assert(wk->currMsg == NULL); + /* - * Keep advancing the state while either: (a) the event is still - * unprocessed (usually because it's the first iteration of the loop), or - * (b) the state can execute, and does not need to wait for any socket - * events + * Skip already acknowledged messages. Used after reconnection to get to + * the first not yet sent message. Otherwise we always just send 'msg'. */ - while (events || StateShouldImmediatelyExecute(wk->state)) - { - /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. - */ - AssertEventsOkForState(events, wk); + while (msg != NULL && (msg->ackMask & (1 << i)) != 0) + msg = msg->next; - /* Execute the code corresponding to the current state */ - switch (wk->state) + wk->currMsg = msg; + wk->flushWrite = false; + + /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ + if (!SendAppendRequests(wk)) + return; +} + +/* + * Broadcast new message to all caught-up walkeepers + */ +static void +BroadcastMessage(WalMessage *msg) +{ + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) { - /* - * WAL keepers are only taken out of SS_OFFLINE by calls to - * ResetConnection - */ - case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", - wk->host, wk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ + SendMessageToNode(i, msg); + } + } +} + +static WalMessage * +CreateMessage(XLogRecPtr startpos, char *data, int len) +{ + /* Create new message and append it to message queue */ + WalMessage *msg; + XLogRecPtr endpos; + + len -= XLOG_HDR_SIZE; + endpos = startpos + len; + if (msgQueueTail && 
msgQueueTail->req.endLsn >= endpos) + { + /* Message already queued */ + return NULL; + } + Assert(len >= 0); + msg = (WalMessage *) malloc(sizeof(WalMessage) + len); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; + + msg->size = sizeof(AppendRequestHeader) + len; + msg->next = NULL; + msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.epochStartLsn = propEpochStartLsn; + msg->req.beginLsn = startpos; + msg->req.endLsn = endpos; + msg->req.proposerId = proposerGreeting.proposerId; + memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); - /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting - */ - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); + Assert(msg->req.endLsn >= lastSentLsn); + lastSentLsn = msg->req.endLsn; + return msg; +} - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; +/* + * Create WAL message with no data, just to let the walkeepers + * know that commit lsn has advanced. 
+ */ +static WalMessage * +CreateMessageCommitLsnOnly(XLogRecPtr lsn) +{ + /* Create new message and append it to message queue */ + WalMessage *msg; - switch (result) - { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); + msg = (WalMessage *) malloc(sizeof(WalMessage)); + if (msgQueueTail != NULL) + msgQueueTail->next = msg; + else + msgQueueHead = msg; + msgQueueTail = msg; - /* - * Once we're fully connected, we can move to the - * next state - */ - wk->state = SS_EXEC_STARTWALPUSH; + msg->size = sizeof(AppendRequestHeader); + msg->next = NULL; + msg->ackMask = 0; + msg->req.tag = 'a'; + msg->req.term = propTerm; + msg->req.epochStartLsn = propEpochStartLsn; - /* - * Even though SS_EXEC_STARTWALPUSH doesn't wait - * on anything, we do need to replace the current - * event, so we have to just pick something. We'll - * eventually need the socket to be readable, so - * we go with that. - */ - new_events = WL_SOCKET_READABLE; - break; + /* + * This serves two purposes: 1) After all msgs from previous epochs are + * pushed we queue empty WalMessage with lsn set to epochStartLsn which + * commands to switch the epoch, which allows to do the switch without + * creating new epoch records (we especially want to avoid such in --sync + * mode). Walproposer can advance commit_lsn only after the switch, so + * this lsn (reported back) also is the first possible advancement point. + * 2) Maintain common invariant of queue entries sorted by LSN. 
+ */ + msg->req.beginLsn = lsn; + msg->req.endLsn = lsn; + msg->req.proposerId = proposerGreeting.proposerId; - /* - * If we need to poll to finish connecting, - * continue doing that - */ - case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; + /* + * truncateLsn and commitLsn are set just before the message sent, in + * SendMessageToNode() + */ + return msg; +} - case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); +/* + * Send queue messages starting from wk->currMsg until the end or non-writable + * socket, whichever comes first. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + */ +static bool +SendAppendRequests(WalKeeper *wk) +{ + int wki = wk - walkeeper; + WalMessage *msg; + AppendRequestHeader *req; - /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to - * ReconnectWalKeepers. - */ - ShutdownConnection(wk); - return; - } + if (wk->flushWrite) + { + if (!AsyncFlush(wk)) + /* + * AsyncFlush failed, that could happen if the socket is closed or + * we have nothing to write and should wait for writeable socket. + */ + return wk->state == SS_ACTIVE; - /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. - */ - HackyRemoveWalProposerEvent(wk); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - break; - } + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } - /* - * Send "START_WAL_PUSH" command to the walkeeper. 
After - * sending, wait for response with SS_WAIT_EXEC_RESULT - */ - case SS_EXEC_STARTWALPUSH: - { - char *query = NULL; - if (zenith_pageserver_connstring_walproposer != NULL) { - query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); - } else { - query = psprintf("START_WAL_PUSH"); - } - if (!walprop_send_query(wk->conn, query)) - { - pfree(query); - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; - } - pfree(query); - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, WL_SOCKET_READABLE); - break; - } + while (wk->currMsg) + { + msg = wk->currMsg; + req = &msg->req; - case SS_WAIT_EXEC_RESULT: - switch (walprop_get_query_result(wk->conn)) - { - /* - * Successful result, move on to starting the - * handshake - */ - case WP_EXEC_SUCCESS_COPYBOTH: + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; - /* - * Because this state is immediately executable, we'll - * start this on the next iteration of the loop - */ - wk->state = SS_HANDSHAKE_SEND; - break; + Assert((msg->ackMask & (1 << wki)) == 0); - /* - * Needs repeated calls to finish. Wait until the - * socket is readable - */ - case WP_EXEC_NEEDS_INPUT: + /* + * If we need to send this message not from the beginning, + * form the cut version. Only happens for the first + * message. 
+ */ + if (wk->startStreamingAt > msg->req.beginLsn) + { + uint32 len; + uint32 size; - /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set - */ - break; + Assert(wk->startStreamingAt < req->endLsn); - case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; + len = msg->req.endLsn - wk->startStreamingAt; + size = sizeof(AppendRequestHeader) + len; + req = malloc(size); + *req = msg->req; + req->beginLsn = wk->startStreamingAt; + memcpy(req + 1, + (char *) (&msg->req + 1) + wk->startStreamingAt - + msg->req.beginLsn, + len); + } - /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" - */ - case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from walkeeper %s:%s query execution", - wk->host, wk->port); - ShutdownConnection(wk); - return; - } - break; + elog(LOG, + "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); - /* - * Start handshake: first of all send information about the - * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for - * a response to finish the handshake. - */ - case SS_HANDSHAKE_SEND: + /* if this is the first sent message, we should start processing feedback */ + if (wk->ackMsg == NULL) + wk->ackMsg = wk->currMsg; - /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. 
- */ - if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) - return; + /* + * We write with msg->size here because the body of the + * message is stored after the end of the WalMessage + * struct, in the allocation for each msg + */ + if (!AsyncWrite(wk, req, + sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, + SS_ACTIVE)) + { + if (req != &msg->req) + free(req); + if (wk->state == SS_ACTIVE) + { + wk->flushWrite = true; + return true; + } + return false; + } + if (req != &msg->req) + free(req); - break; + /* continue writing the next message */ + wk->currMsg = wk->currMsg->next; + } - /* - * Finish handshake comms: receive information about the WAL - * keeper - */ - case SS_HANDSHAKE_RECV: + return true; +} - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) - return; +/* + * Receive and process all available feedback. + * + * Can change state if Async* functions encounter errors and reset connection. + * Returns false in this case, true otherwise. + * + * NB: This function can call SendMessageToNode and produce new messages. + */ +static bool +RecvAppendResponses(WalKeeper *wk) +{ + XLogRecPtr minQuorumLsn; + int wki = wk - walkeeper; + bool readAnything = false; - /* Protocol is all good, move to voting. */ - wk->state = SS_VOTING; + while (true) + { + /* + * If our reading doesn't immediately succeed, any + * necessary error handling or state setting is taken care + * of. We can leave any other work until later. + */ + if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + break; - /* - * Don't need to update the event set yet. 
Either we update - * the event set to WL_SOCKET_READABLE *or* we change the - * state to SS_SEND_VOTE in the loop below - */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - wk->feedback.flushLsn = truncateLsn; - wk->feedback.hs.ts = 0; + Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); - /* - * We want our term to be highest and unique, so choose max - * and +1 once we have majority. - */ - propTerm = Max(walkeeper[i].greet.term, propTerm); + /* + * We shouldn't read responses ahead of wk->currMsg, because that will + * look like we are receiving responses for messages that haven't been + * sent yet. This can happen when message was placed in a buffer in + * SendAppendRequests, but sent through a wire only with a flush inside + * AsyncReadFixed. In this case, we should move wk->currMsg. + */ + if (wk->ackMsg == wk->currMsg) + { + /* Couldn't happen without flush flag */ + Assert(wk->flushWrite); - /* - * Check if we have quorum. If there aren't enough walkeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. - * - * If we do have quorum, we can start an election - */ - if (++n_connected < quorum) - { - /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. - */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - } - else - { - if (n_connected == quorum) - { - propTerm++; - /* prepare voting message */ - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); - } + wk->currMsg = wk->currMsg->next; + wk->flushWrite = false; + } - /* - * Now send voting request to the cohort and wait - * responses - */ - for (int j = 0; j < n_walkeepers; j++) - { - /* - * Remember: SS_VOTING indicates that the walkeeper is - * participating in voting, but hasn't sent anything - * yet. The ones that have sent something are given - * SS_SEND_VOTE or SS_WAIT_VERDICT. 
- */ - if (walkeeper[j].state == SS_VOTING) - { - walkeeper[j].state = SS_SEND_VOTE; - /* Immediately send info */ - AdvancePollState(j, WL_NO_EVENTS); - } - } - } - break; + wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms + * receiving of this + * message */ - /* - * Voting is an idle state - we don't expect any events to - * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see - * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. - */ - case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + wk->ackMsg = wk->ackMsg->next; + readAnything = true; + } - /* We have quorum for voting, send our vote request */ - case SS_SEND_VOTE: - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) - return; + if (!readAnything) + return wk->state == SS_ACTIVE; - /* If successful, wait for read-ready with SS_WAIT_VERDICT */ - break; + HandleWalKeeperResponse(); - /* Start reading the walkeeper response for our candidate */ - case SS_WAIT_VERDICT: - wk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) - return; + /* + * Also send the new commit lsn to all the walkeepers. + * + * FIXME: This is redundant for walkeepers that have other + * outbound messages pending. 
+ */ + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + if (minQuorumLsn > lastSentCommitLsn) + { + BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + lastSentCommitLsn = minQuorumLsn; + } - elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), - LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + return wk->state == SS_ACTIVE; +} - /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. - */ - if ((!wk->voteResponse.voteGiven) && - (wk->voteResponse.term > propTerm || n_votes < quorum)) - { - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); - } - Assert(wk->voteResponse.term == propTerm); +/* + * Combine hot standby feedbacks from all walkeepers. + */ +static void +CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) +{ + hs->ts = 0; + hs->xmin.value = ~0; /* largest unsigned value */ + hs->catalog_xmin.value = ~0; /* largest unsigned value */ - /* Handshake completed, do we have quorum? 
*/ - n_votes++; - if (n_votes < quorum) - { - wk->state = SS_IDLE; /* can't do much yet, no quorum */ - } - else if (n_votes > quorum) - { + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.hs.ts != 0) + { + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) + { + hs->xmin = walkeeper[i].feedback.hs.xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) + { + hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; + hs->ts = walkeeper[i].feedback.hs.ts; + } + } + } +} - /* recovery already performed, just start streaming */ - SendProposerElected(wk); - } - else - { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for - * read-ready */ +/* + * Get minimum of disk consistent LSNs of all safekeepers + */ +static XLogRecPtr +CalculateDiskConsistentLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.diskConsistentLsn < lsn) + { + lsn = walkeeper[i].feedback.diskConsistentLsn; + } + } + return lsn; +} - DetermineEpochStartLsn(); +/* + * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the + * last WAL record that can be safely discarded. 
+ */ +static XLogRecPtr +CalculateMinFlushLsn(void) +{ + XLogRecPtr lsn = UnknownXLogRecPtr; + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].feedback.flushLsn < lsn) + lsn = walkeeper[i].feedback.flushLsn; + } + return lsn; +} - /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them - */ - if (truncateLsn < propEpochStartLsn) - { - elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); - /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) - elog(FATAL, "Failed to recover state"); - } - else if (syncSafekeepers) - { - /* Sync is not needed: just exit */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } +/* + * Calculate WAL position acknowledged by quorum + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(void) +{ + XLogRecPtr responses[MAX_WALKEEPERS]; - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_IDLE) - SendProposerElected(&walkeeper[i]); - } + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < n_walkeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to epochStartLsn. + */ + responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? + walkeeper[i].feedback.flushLsn : 0; + } + qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); - /* - * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. 
- */ + /* + * Get the smallest LSN committed by quorum + */ + return responses[n_walkeepers - quorum]; +} - if (syncSafekeepers) - { - /* - * Queue empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); +static void +HandleWalKeeperResponse(void) +{ + HotStandbyFeedback hsFeedback; + XLogRecPtr minQuorumLsn; + XLogRecPtr diskConsistentLsn; + XLogRecPtr minFlushLsn; - /* keep polling until all walkeepers are synced */ - return; - } + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); + diskConsistentLsn = CalculateDiskConsistentLsn(); - WalProposerStartStreaming(propEpochStartLsn); - /* Should not return here */ - } + if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) + { - break; + if (minQuorumLsn > lastFeedback.flushLsn) + lastFeedback.flushLsn = minQuorumLsn; - /* Flush proposer announcement message */ - case SS_SEND_ELECTED_FLUSH: + lastFeedback.diskConsistentLsn = diskConsistentLsn; - /* - * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll - * wait until the next poll comes along. - */ - if (!AsyncFlush(wk)) - return; - - StartStreaming(wk); + /* advance the replication slot */ + if (!syncSafekeepers) + ProcessStandbyReply( + // write_lsn - This is what durably stored in WAL service. + lastFeedback.flushLsn, + //flush_lsn - This is what durably stored in WAL service. + lastFeedback.flushLsn, + //apply_lsn - This is what processed and durably saved at pageserver. 
+ lastFeedback.diskConsistentLsn, + GetCurrentTimestamp(), false); + } - break; + CombineHotStanbyFeedbacks(&hsFeedback); + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) + { + lastFeedback.hs = hsFeedback; + if (!syncSafekeepers) + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + } + /* + * Try to advance truncateLsn to minFlushLsn, which is the last record + * flushed to all safekeepers. We must always start streaming from the + * beginning of the record, which simplifies decoding on the far end. + * + * Advanced truncateLsn should be not further than nearest commitLsn. + * This prevents surprising violation of truncateLsn <= commitLsn + * invariant which might occur because 1) truncateLsn can be advanced + * immediately once chunk is broadcast to all safekeepers, and + * commitLsn generally can't be advanced based on feedback from + * safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft); 2) chunks we + * read from WAL and send are plain sheets of bytes, but safekeepers + * ack only on record boundaries. + */ + minFlushLsn = CalculateMinFlushLsn(); + if (minFlushLsn > truncateLsn) + truncateLsn = minFlushLsn; - /* - * Idle state for sending WAL. 
Moved out only by calls to - * SendMessageToNode - */ - case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + { + WalMessage *msg = msgQueueHead; + msgQueueHead = msg->next; + memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); + free(msg); + } + if (!msgQueueHead) /* queue is empty */ + msgQueueTail = NULL; + /* truncateLsn always points to the first chunk in the queue */ + if (msgQueueHead) + { + /* Max takes care of special 0-sized messages */ + Assert(truncateLsn >= msgQueueHead->req.beginLsn && + truncateLsn < Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); + } - case SS_ACTIVE: - if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(wk)) - return; + /* + * Generally sync is done when majority switched the epoch so we committed + * epochStartLsn and made the majority aware of it, ensuring they are + * ready to give all WAL to pageserver. It would mean whichever majority + * is alive, there will be at least one safekeeper who is able to stream + * WAL to pageserver to make basebackup possible. However, since at the + * moment we don't have any good mechanism of defining the healthy and + * most advanced safekeeper who should push the wal into pageserver and + * basically the random one gets connected, to prevent hanging basebackup + * (due to pageserver connecting to not-synced-walkeeper) we currently + * wait for all seemingly alive walkeepers to get synced. 
+ */ + if (syncSafekeepers) + { + int n_synced; - if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(wk)) - return; + n_synced = 0; + for (int i = 0; i < n_walkeepers; i++) + { + WalKeeper *wk = &walkeeper[i]; + bool synced = wk->feedback.commitLsn >= propEpochStartLsn; - UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); - break; + /* alive safekeeper which is not synced yet; wait for it */ + if (wk->state != SS_OFFLINE && !synced) + return; + if (synced) + n_synced++; + } + if (n_synced >= quorum) + { + /* All walkeepers synced! */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); } - - /* - * We've already done something for these events - don't attempt more - * states than we need to. - */ - events = WL_NO_EVENTS; } } @@ -2078,29 +2135,3 @@ AsyncFlush(WalKeeper *wk) return false; } } - -/* - * WalProposerRegister - * Register a background worker porposing WAL to wal acceptors - */ -void -WalProposerRegister(void) -{ - BackgroundWorker bgw; - - if (*wal_acceptors_list == '\0') - return; - - memset(&bgw, 0, sizeof(bgw)); - bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; - bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "postgres"); - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); - snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); - snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; - bgw.bgw_notify_pid = 0; - bgw.bgw_main_arg = (Datum) 0; - - RegisterBackgroundWorker(&bgw); -} From 326dbd77361256f41102a6fbae358df93090092d Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 4 Jan 2022 13:29:06 +0300 Subject: [PATCH 088/167] Simplify walproposer code (#114) * Clean up walproposer states * Migrate AsyncReadFixed to AsyncReadMessage * Handle flushWrite better a bit * Update SS_ACTIVE event set in single place Now event set is updated only in the end of HandleActiveState, after all handlers code 
was executed. * Add comment on SS_ACTIVE write event * Add TODO for SS_ACTIVE DesiredEvents --- src/backend/replication/walproposer.c | 1025 ++++++++++--------- src/backend/replication/walproposer_utils.c | 46 +- src/include/replication/walproposer.h | 51 +- 3 files changed, 554 insertions(+), 568 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index b307c79177d..14f300d110b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -110,7 +110,15 @@ static void ShutdownConnection(WalKeeper *wk); static void ResetConnection(WalKeeper *wk); static long TimeToReconnect(TimestampTz now); static void ReconnectWalKeepers(void); -static void AdvancePollState(int i, uint32 events); +static void AdvancePollState(WalKeeper *wk, uint32 events); +static void HandleConnectionEvent(WalKeeper *wk); +static void SendStartWALPush(WalKeeper *wk); +static void RecvStartWALPushResult(WalKeeper *wk); +static void SendProposerGreeting(WalKeeper *wk); +static void RecvAcceptorGreeting(WalKeeper *wk); +static void SendVoteRequest(WalKeeper *wk); +static void RecvVoteResponse(WalKeeper *wk); +static void HandleElectedProposer(void); static term_t GetHighestTerm(TermHistory *th); static term_t GetEpoch(WalKeeper *wk); static void DetermineEpochStartLsn(void); @@ -122,6 +130,7 @@ static void SendMessageToNode(int i, WalMessage *msg); static void BroadcastMessage(WalMessage *msg); static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void HandleActiveState(WalKeeper *wk, uint32 events); static bool SendAppendRequests(WalKeeper *wk); static bool RecvAppendResponses(WalKeeper *wk); static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); @@ -129,10 +138,9 @@ static XLogRecPtr CalculateDiskConsistentLsn(void); static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr 
GetAcknowledgedByQuorumWALPosition(void); static void HandleWalKeeperResponse(void); -static bool AsyncRead(int i, char **buf, int *buf_size); -static bool AsyncReadFixed(int i, void *value, size_t value_size); -static bool AsyncReadMessage(int i, AcceptorProposerMessage *anymsg); -static bool BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state); +static bool AsyncRead(WalKeeper *wk, char **buf, int *buf_size); +static bool AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg); +static bool BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state); static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); static bool AsyncFlush(WalKeeper *wk); @@ -250,21 +258,19 @@ WalProposerPoll(void) { WalKeeper *wk; int rc; - int i; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); wk = (WalKeeper *) event.user_data; - i = (int) (wk - walkeeper); /* * If the event contains something that one of our walkeeper states * was waiting for, we'll advance its state. */ if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(i, event.events); + AdvancePollState(wk, event.events); /* * If the timeout expired, attempt to reconnect to any walkeepers that @@ -439,7 +445,7 @@ InitEventSet(void) * * This function is called any time the WAL keeper's state switches to one where * it has to wait to continue. This includes the full body of AdvancePollState - * and each call to AsyncRead/BlockingWrite/AsyncWrite/AsyncFlush. + * and calls to IO helper functions. 
*/ static void UpdateEventSet(WalKeeper *wk, uint32 events) @@ -640,453 +646,446 @@ ReconnectWalKeepers(void) } /* - * Performs the logic for advancing the state machine of the 'i'th walkeeper, + * Performs the logic for advancing the state machine of the specified walkeeper, * given that a certain set of events has occured. */ static void -AdvancePollState(int i, uint32 events) +AdvancePollState(WalKeeper *wk, uint32 events) { - WalKeeper *wk = &walkeeper[i]; /* - * Keep advancing the state while either: (a) the event is still - * unprocessed (usually because it's the first iteration of the loop), or - * (b) the state can execute, and does not need to wait for any socket - * events + * Sanity check. We assume further down that the operations don't + * block because the socket is ready. */ - while (events || StateShouldImmediatelyExecute(wk->state)) + AssertEventsOkForState(events, wk); + + /* Execute the code corresponding to the current state */ + switch (wk->state) { - /* - * Sanity check. We assume further down that the operations don't - * block because the socket is ready. - */ - AssertEventsOkForState(events, wk); + /* + * WAL keepers are only taken out of SS_OFFLINE by calls to + * ResetConnection + */ + case SS_OFFLINE: + elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", + wk->host, wk->port); + break; /* actually unreachable, but prevents + * -Wimplicit-fallthrough */ - /* Execute the code corresponding to the current state */ - switch (wk->state) - { - /* - * WAL keepers are only taken out of SS_OFFLINE by calls to - * ResetConnection - */ - case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", - wk->host, wk->port); - break; /* actually unreachable, but prevents - * -Wimplicit-fallthrough */ + /* + * Both connecting states run the same logic. 
The only + * difference is the events they're expecting + */ + case SS_CONNECTING_READ: + case SS_CONNECTING_WRITE: + HandleConnectionEvent(wk); + break; - /* - * Both connecting states run the same logic. The only - * difference is the events they're expecting - */ - case SS_CONNECTING_READ: - case SS_CONNECTING_WRITE: - { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - - /* The new set of events we'll wait on, after updating */ - uint32 new_events = WL_NO_EVENTS; - - switch (result) - { - case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); - - /* - * Once we're fully connected, we can move to the - * next state - */ - wk->state = SS_EXEC_STARTWALPUSH; - - /* - * Even though SS_EXEC_STARTWALPUSH doesn't wait - * on anything, we do need to replace the current - * event, so we have to just pick something. We'll - * eventually need the socket to be readable, so - * we go with that. - */ - new_events = WL_SOCKET_READABLE; - break; - - /* - * If we need to poll to finish connecting, - * continue doing that - */ - case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; - new_events = WL_SOCKET_READABLE; - break; - case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; - new_events = WL_SOCKET_WRITEABLE; - break; - - case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - - /* - * If connecting failed, we don't want to restart - * the connection because that might run us into a - * loop. Instead, shut it down -- it'll naturally - * restart at a slower interval on calls to - * ReconnectWalKeepers. - */ - ShutdownConnection(wk); - return; - } - - /* - * Because PQconnectPoll can change the socket, we have to - * un-register the old event and re-register an event on - * the new socket. 
- */ - HackyRemoveWalProposerEvent(wk); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - break; - } + /* + * Waiting for a successful CopyBoth response. + */ + case SS_WAIT_EXEC_RESULT: + RecvStartWALPushResult(wk); + break; - /* - * Send "START_WAL_PUSH" command to the walkeeper. After - * sending, wait for response with SS_WAIT_EXEC_RESULT - */ - case SS_EXEC_STARTWALPUSH: - { - char *query = NULL; - if (zenith_pageserver_connstring_walproposer != NULL) { - query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); - } else { - query = psprintf("START_WAL_PUSH"); - } - if (!walprop_send_query(wk->conn, query)) - { - pfree(query); - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; - } - pfree(query); - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, WL_SOCKET_READABLE); - break; - } - - case SS_WAIT_EXEC_RESULT: - switch (walprop_get_query_result(wk->conn)) - { - /* - * Successful result, move on to starting the - * handshake - */ - case WP_EXEC_SUCCESS_COPYBOTH: - - /* - * Because this state is immediately executable, we'll - * start this on the next iteration of the loop - */ - wk->state = SS_HANDSHAKE_SEND; - break; - - /* - * Needs repeated calls to finish. 
Wait until the - * socket is readable - */ - case WP_EXEC_NEEDS_INPUT: - - /* - * SS_WAIT_EXEC_RESULT is always reached through an - * event, so we don't need to update the event set - */ - break; - - case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); - return; - - /* - * Unexpected result -- funamdentally an error, but we - * want to produce a custom message, rather than a - * generic "something went wrong" - */ - case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from walkeeper %s:%s query execution", - wk->host, wk->port); - ShutdownConnection(wk); - return; - } - break; + /* + * Finish handshake comms: receive information about the safekeeper. + */ + case SS_HANDSHAKE_RECV: + RecvAcceptorGreeting(wk); + break; - /* - * Start handshake: first of all send information about the - * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for - * a response to finish the handshake. - */ - case SS_HANDSHAKE_SEND: + /* + * Voting is an idle state - we don't expect any events to trigger. + * Refer to the execution of SS_HANDSHAKE_RECV to see how nodes are + * transferred from SS_VOTING to sending actual vote requests. + */ + case SS_VOTING: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + return; - /* - * On failure, logging & resetting the connection is handled. - * We just need to handle the control flow. 
- */ - if (!BlockingWrite(i, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV)) - return; + /* Read the safekeeper response for our candidate */ + case SS_WAIT_VERDICT: + RecvVoteResponse(wk); + break; - break; + /* Flush proposer announcement message */ + case SS_SEND_ELECTED_FLUSH: - /* - * Finish handshake comms: receive information about the WAL - * keeper - */ - case SS_HANDSHAKE_RECV: + /* + * AsyncFlush ensures we only move on to SS_ACTIVE once the flush + * completes. If we still have more to do, we'll wait until the next + * poll comes along. + */ + if (!AsyncFlush(wk)) + return; + + /* flush is done, event set and state will be updated later */ + StartStreaming(wk); + break; - /* - * If our reading doesn't immediately succeed, any necessary - * error handling or state setting is taken care of. We can - * leave any other work until later. - */ - if (!AsyncReadFixed(i, &wk->greet, sizeof(wk->greet))) - return; + /* + * Idle state for waiting votes from quorum. + */ + case SS_IDLE: + elog(WARNING, "EOF from node %s:%s in %s state", wk->host, + wk->port, FormatWalKeeperState(wk->state)); + ResetConnection(wk); + return; - /* Protocol is all good, move to voting. */ - wk->state = SS_VOTING; + /* + * Active state is used for streaming WAL and receiving feedback. + */ + case SS_ACTIVE: + HandleActiveState(wk, events); + break; + } +} - /* - * Don't need to update the event set yet. Either we update - * the event set to WL_SOCKET_READABLE *or* we change the - * state to SS_SEND_VOTE in the loop below - */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - wk->feedback.flushLsn = truncateLsn; - wk->feedback.hs.ts = 0; +static void +HandleConnectionEvent(WalKeeper *wk) +{ + WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); - /* - * We want our term to be highest and unique, so choose max - * and +1 once we have majority. 
- */ - propTerm = Max(walkeeper[i].greet.term, propTerm); + /* The new set of events we'll wait on, after updating */ + uint32 new_events = WL_NO_EVENTS; - /* - * Check if we have quorum. If there aren't enough walkeepers, - * wait and do nothing. We'll eventually get a task when the - * election starts. - * - * If we do have quorum, we can start an election - */ - if (++n_connected < quorum) - { - /* - * SS_VOTING is an idle state; read-ready indicates the - * connection closed. - */ - UpdateEventSet(wk, WL_SOCKET_READABLE); - } - else - { - if (n_connected == quorum) - { - propTerm++; - /* prepare voting message */ - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); - } - - /* - * Now send voting request to the cohort and wait - * responses - */ - for (int j = 0; j < n_walkeepers; j++) - { - /* - * Remember: SS_VOTING indicates that the walkeeper is - * participating in voting, but hasn't sent anything - * yet. The ones that have sent something are given - * SS_SEND_VOTE or SS_WAIT_VERDICT. - */ - if (walkeeper[j].state == SS_VOTING) - { - walkeeper[j].state = SS_SEND_VOTE; - /* Immediately send info */ - AdvancePollState(j, WL_NO_EVENTS); - } - } - } - break; + switch (result) + { + case WP_CONN_POLLING_OK: + elog(LOG, "connected with node %s:%s", wk->host, + wk->port); - /* - * Voting is an idle state - we don't expect any events to - * trigger. Refer to the execution of SS_HANDSHAKE_RECV to see - * how nodes are transferred from SS_VOTING to SS_SEND_VOTE. - */ - case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + /* + * We have to pick some event to update event set. + * We'll eventually need the socket to be readable, + * so we go with that. 
+ */ + new_events = WL_SOCKET_READABLE; + break; + + /* + * If we need to poll to finish connecting, + * continue doing that + */ + case WP_CONN_POLLING_READING: + wk->state = SS_CONNECTING_READ; + new_events = WL_SOCKET_READABLE; + break; + case WP_CONN_POLLING_WRITING: + wk->state = SS_CONNECTING_WRITE; + new_events = WL_SOCKET_WRITEABLE; + break; - /* We have quorum for voting, send our vote request */ - case SS_SEND_VOTE: - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); - /* On failure, logging & resetting is handled */ - if (!BlockingWrite(i, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) - return; + case WP_CONN_POLLING_FAILED: + elog(WARNING, "Failed to connect to node '%s:%s': %s", + wk->host, wk->port, walprop_error_message(wk->conn)); - /* If successful, wait for read-ready with SS_WAIT_VERDICT */ - break; + /* + * If connecting failed, we don't want to restart + * the connection because that might run us into a + * loop. Instead, shut it down -- it'll naturally + * restart at a slower interval on calls to + * ReconnectWalKeepers. + */ + ShutdownConnection(wk); + return; + } - /* Start reading the walkeeper response for our candidate */ - case SS_WAIT_VERDICT: - wk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(i, (AcceptorProposerMessage *) &wk->voteResponse)) - return; + /* + * Because PQconnectPoll can change the socket, we have to + * un-register the old event and re-register an event on + * the new socket. 
+ */ + HackyRemoveWalProposerEvent(wk); + wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); - elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), - LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + /* If we successfully connected, send START_WAL_PUSH query */ + if (result == WP_CONN_POLLING_OK) + SendStartWALPush(wk); +} - /* - * In case of acceptor rejecting our vote, bail out, but only - * if either it already lives in strictly higher term - * (concurrent compute spotted) or we are not elected yet and - * thus need the vote. - */ - if ((!wk->voteResponse.voteGiven) && - (wk->voteResponse.term > propTerm || n_votes < quorum)) - { - elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); - } - Assert(wk->voteResponse.term == propTerm); - - /* Handshake completed, do we have quorum? 
*/ - n_votes++; - if (n_votes < quorum) - { - wk->state = SS_IDLE; /* can't do much yet, no quorum */ - } - else if (n_votes > quorum) - { - - /* recovery already performed, just start streaming */ - SendProposerElected(wk); - } - else - { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for - * read-ready */ - - DetermineEpochStartLsn(); - - /* - * Check if not all safekeepers are up-to-date, we need to - * download WAL needed to synchronize them - */ - if (truncateLsn < propEpochStartLsn) - { - elog(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(truncateLsn), - LSN_FORMAT_ARGS(propEpochStartLsn)); - /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) - elog(FATAL, "Failed to recover state"); - } - else if (syncSafekeepers) - { - /* Sync is not needed: just exit */ - fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); - exit(0); - } - - for (int i = 0; i < n_walkeepers; i++) - { - if (walkeeper[i].state == SS_IDLE) - SendProposerElected(&walkeeper[i]); - } - - /* - * The proposer has been elected, and there will be no quorum waiting - * after this point. There will be no safekeeper with state SS_IDLE - * also, because that state is used only for quorum waiting. - */ - - if (syncSafekeepers) - { - /* - * Queue empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who doesn't generate any real new - * records. Will go away once we switch to async acks. - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - - /* keep polling until all walkeepers are synced */ - return; - } - - WalProposerStartStreaming(propEpochStartLsn); - /* Should not return here */ - } +/* + * Send "START_WAL_PUSH" message as an empty query to the walkeeper. 
Performs + * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something + * goes wrong, change state to SS_OFFLINE and shutdown the connection. + */ +static void +SendStartWALPush(WalKeeper *wk) +{ + char *query = NULL; + if (zenith_pageserver_connstring_walproposer != NULL) { + query = psprintf("START_WAL_PUSH %s", zenith_pageserver_connstring_walproposer); + } else { + query = psprintf("START_WAL_PUSH"); + } + if (!walprop_send_query(wk->conn, query)) + { + pfree(query); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; + } + pfree(query); + wk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(wk, WL_SOCKET_READABLE); +} - break; +static void +RecvStartWALPushResult(WalKeeper *wk) +{ + switch (walprop_get_query_result(wk->conn)) + { + /* + * Successful result, move on to starting the + * handshake + */ + case WP_EXEC_SUCCESS_COPYBOTH: - /* Flush proposer announcement message */ - case SS_SEND_ELECTED_FLUSH: + SendProposerGreeting(wk); + break; - /* - * AsyncFlush ensures we only move on to SS_RECV_FEEDBACK once - * the flush completes. If we still have more to do, we'll - * wait until the next poll comes along. - */ - if (!AsyncFlush(wk)) - return; - - StartStreaming(wk); + /* + * Needs repeated calls to finish. Wait until the + * socket is readable + */ + case WP_EXEC_NEEDS_INPUT: - break; + /* + * SS_WAIT_EXEC_RESULT is always reached through an + * event, so we don't need to update the event set + */ + break; + case WP_EXEC_FAILED: + elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", + wk->host, wk->port, walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return; - /* - * Idle state for sending WAL. 
Moved out only by calls to - * SendMessageToNode - */ - case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); - break; + /* + * Unexpected result -- funamdentally an error, but we + * want to produce a custom message, rather than a + * generic "something went wrong" + */ + case WP_EXEC_UNEXPECTED_SUCCESS: + elog(WARNING, "Received bad response from walkeeper %s:%s query execution", + wk->host, wk->port); + ShutdownConnection(wk); + return; + } +} + +/* + * Start handshake: first of all send information about the + * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for + * a response to finish the handshake. + */ +static void +SendProposerGreeting(WalKeeper *wk) +{ + /* + * On failure, logging & resetting the connection is handled. + * We just need to handle the control flow. + */ + BlockingWrite(wk, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV); +} +static void +RecvAcceptorGreeting(WalKeeper *wk) +{ + /* + * If our reading doesn't immediately succeed, any necessary + * error handling or state setting is taken care of. We can + * leave any other work until later. + */ + wk->greet.apm.tag = 'g'; + if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->greet)) + return; - case SS_ACTIVE: - if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(wk)) - return; + /* Protocol is all good, move to voting. */ + wk->state = SS_VOTING; + wk->feedback.flushLsn = truncateLsn; + wk->feedback.hs.ts = 0; - if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(wk)) - return; + /* + * We want our term to be highest and unique, so choose max + * and +1 once we have majority. + */ + propTerm = Max(wk->greet.term, propTerm); - UpdateEventSet(wk, WL_SOCKET_READABLE | (wk->currMsg == NULL ? 0 : WL_SOCKET_WRITEABLE)); - break; + /* + * Check if we have quorum. If there aren't enough safekeepers, + * wait and do nothing. 
We'll eventually get a task when the + * election starts. + * + * If we do have quorum, we can start an election + */ + if (++n_connected < quorum) + { + /* + * SS_VOTING is an idle state; read-ready indicates the + * connection closed. + */ + UpdateEventSet(wk, WL_SOCKET_READABLE); + } + else + { + if (n_connected == quorum) + { + propTerm++; + /* prepare voting message */ + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); } /* - * We've already done something for these events - don't attempt more - * states than we need to. + * Now send voting request to the cohort and wait + * responses */ - events = WL_NO_EVENTS; + for (int j = 0; j < n_walkeepers; j++) + { + /* + * Remember: SS_VOTING indicates that the safekeeper is + * participating in voting, but hasn't sent anything + * yet. + */ + if (walkeeper[j].state == SS_VOTING) + SendVoteRequest(&walkeeper[j]); + } + } +} + +static void +SendVoteRequest(WalKeeper *wk) +{ + /* We have quorum for voting, send our vote request */ + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); + /* On failure, logging & resetting is handled */ + if (!BlockingWrite(wk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + return; + + /* If successful, wait for read-ready with SS_WAIT_VERDICT */ +} + +static void +RecvVoteResponse(WalKeeper *wk) +{ + wk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->voteResponse)) + return; + + elog(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), + LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + + /* + * In case of acceptor rejecting our vote, bail out, but only + * if 
either it already lives in strictly higher term + * (concurrent compute spotted) or we are not elected yet and + * thus need the vote. + */ + if ((!wk->voteResponse.voteGiven) && + (wk->voteResponse.term > propTerm || n_votes < quorum)) + { + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->voteResponse.term, propTerm); + } + Assert(wk->voteResponse.term == propTerm); + + /* Handshake completed, do we have quorum? */ + n_votes++; + if (n_votes < quorum) + { + wk->state = SS_IDLE; /* can't do much yet, no quorum */ + } + else if (n_votes > quorum) + { + /* recovery already performed, just start streaming */ + SendProposerElected(wk); + } + else + { + wk->state = SS_IDLE; + UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + * read-ready */ + + HandleElectedProposer(); + } +} + +/* + * Called once a majority of acceptors have voted for us and current proposer + * has been elected. + * + * Sends ProposerElected message to all acceptors in SS_IDLE state and starts + * replication from walsender. 
+ */ +static void +HandleElectedProposer(void) +{ + DetermineEpochStartLsn(); + + /* + * Check if not all safekeepers are up-to-date, we need to + * download WAL needed to synchronize them + */ + if (truncateLsn < propEpochStartLsn) + { + elog(LOG, + "start recovery because truncateLsn=%X/%X is not " + "equal to epochStartLsn=%X/%X", + LSN_FORMAT_ARGS(truncateLsn), + LSN_FORMAT_ARGS(propEpochStartLsn)); + /* Perform recovery */ + if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) + elog(FATAL, "Failed to recover state"); + } + else if (syncSafekeepers) + { + /* Sync is not needed: just exit */ + fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); + exit(0); + } + + for (int i = 0; i < n_walkeepers; i++) + { + if (walkeeper[i].state == SS_IDLE) + SendProposerElected(&walkeeper[i]); + } + + /* + * The proposer has been elected, and there will be no quorum waiting + * after this point. There will be no safekeeper with state SS_IDLE + * also, because that state is used only for quorum waiting. + */ + + if (syncSafekeepers) + { + /* + * Queue empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + + /* keep polling until all walkeepers are synced */ + return; } + + WalProposerStartStreaming(propEpochStartLsn); + /* Should not return here */ } /* latest term in TermHistory, or 0 is there is no entries */ @@ -1372,7 +1371,8 @@ WalProposerStartStreaming(XLogRecPtr startpos) } /* - * Start streaming to safekeeper wk, always updates state to SS_ACTIVE. + * Start streaming to safekeeper wk, always updates state to SS_ACTIVE and sets + * correct event set. 
*/ static void StartStreaming(WalKeeper *wk) @@ -1384,7 +1384,6 @@ StartStreaming(WalKeeper *wk) * exactly once for a connection. */ wk->state = SS_ACTIVE; - UpdateEventSet(wk, WL_SOCKET_READABLE); for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { @@ -1395,17 +1394,21 @@ StartStreaming(WalKeeper *wk) } else { + /* event set will be updated inside SendMessageToNode */ SendMessageToNode(wki, msg); return; } } + + /* Call SS_ACTIVE handler to update event set */ + HandleActiveState(wk, WL_NO_EVENTS); } /* - * Start sending message to the particular node. + * Start sending message to the particular node. Always updates event set. * - * Always updates the state and event set for the WAL keeper; setting either of - * these before calling would be redundant work. + * Can be used only for safekeepers in SS_ACTIVE state. State can be changed + * in case of errors. */ static void SendMessageToNode(int i, WalMessage *msg) @@ -1414,6 +1417,7 @@ SendMessageToNode(int i, WalMessage *msg) /* we shouldn't be already sending something */ Assert(wk->currMsg == NULL); + Assert(wk->state == SS_ACTIVE); /* * Skip already acknowledged messages. Used after reconnection to get to @@ -1423,11 +1427,9 @@ SendMessageToNode(int i, WalMessage *msg) msg = msg->next; wk->currMsg = msg; - wk->flushWrite = false; /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - if (!SendAppendRequests(wk)) - return; + HandleActiveState(wk, WL_SOCKET_WRITEABLE); } /* @@ -1527,9 +1529,40 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) return msg; } +/* + * Process all events happened in SS_ACTIVE state, update event set after that. 
+ */ +static void +HandleActiveState(WalKeeper *wk, uint32 events) +{ + uint32 newEvents = WL_SOCKET_READABLE; + + if (events & WL_SOCKET_WRITEABLE) + if (!SendAppendRequests(wk)) + return; + + if (events & WL_SOCKET_READABLE) + if (!RecvAppendResponses(wk)) + return; + + /* + * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data + * in the buffer. + * + * wk->currMsg checks if we have pending unsent messages. This check isn't + * necessary now, because we always send queue messages immediately after + * creation. But it's good to have it here in case we change this behavior + * in the future. + */ + if (wk->currMsg != NULL || wk->flushWrite) + newEvents |= WL_SOCKET_WRITEABLE; + + UpdateEventSet(wk, newEvents); +} + /* * Send queue messages starting from wk->currMsg until the end or non-writable - * socket, whichever comes first. + * socket, whichever comes first. Caller should take care of updating event set. * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. 
@@ -1540,6 +1573,7 @@ SendAppendRequests(WalKeeper *wk) int wki = wk - walkeeper; WalMessage *msg; AppendRequestHeader *req; + PGAsyncWriteResult writeResult; if (wk->flushWrite) { @@ -1550,7 +1584,7 @@ SendAppendRequests(WalKeeper *wk) */ return wk->state == SS_ACTIVE; - wk->currMsg = wk->currMsg->next; + /* Event set will be updated in the end of HandleActiveState */ wk->flushWrite = false; } @@ -1604,24 +1638,39 @@ SendAppendRequests(WalKeeper *wk) * message is stored after the end of the WalMessage * struct, in the allocation for each msg */ - if (!AsyncWrite(wk, req, - sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn, - SS_ACTIVE)) - { - if (req != &msg->req) - free(req); - if (wk->state == SS_ACTIVE) - { - wk->flushWrite = true; - return true; - } - return false; - } + writeResult = walprop_async_write(wk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); + + /* Free up resources */ if (req != &msg->req) free(req); - /* continue writing the next message */ + /* Mark current message as sent, whatever the result is */ wk->currMsg = wk->currMsg->next; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + /* + * We still need to call PQflush some more to finish the job. + * Caller function will handle this by setting right event set. + */ + wk->flushWrite = true; + return true; + + case PG_ASYNC_WRITE_FAIL: + elog(WARNING, "Failed to send to node %s:%s in %s state: %s", + wk->host, wk->port, FormatWalKeeperState(wk->state), + walprop_error_message(wk->conn)); + ShutdownConnection(wk); + return false; + default: + Assert(false); + return false; + } } return true; @@ -1649,7 +1698,8 @@ RecvAppendResponses(WalKeeper *wk) * necessary error handling or state setting is taken care * of. We can leave any other work until later. 
*/ - if (!AsyncReadFixed(wki, &wk->feedback, sizeof(wk->feedback))) + wk->feedback.apm.tag = 'a'; + if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->feedback)) break; Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); @@ -1657,18 +1707,9 @@ RecvAppendResponses(WalKeeper *wk) /* * We shouldn't read responses ahead of wk->currMsg, because that will * look like we are receiving responses for messages that haven't been - * sent yet. This can happen when message was placed in a buffer in - * SendAppendRequests, but sent through a wire only with a flush inside - * AsyncReadFixed. In this case, we should move wk->currMsg. + * sent yet. */ - if (wk->ackMsg == wk->currMsg) - { - /* Couldn't happen without flush flag */ - Assert(wk->flushWrite); - - wk->currMsg = wk->currMsg->next; - wk->flushWrite = false; - } + Assert(wk->ackMsg != wk->currMsg); wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms * receiving of this @@ -1911,10 +1952,8 @@ HandleWalKeeperResponse(void) * failure. */ static bool -AsyncRead(int i, char **buf, int *buf_size) +AsyncRead(WalKeeper *wk, char **buf, int *buf_size) { - WalKeeper *wk = &walkeeper[i]; - switch (walprop_async_read(wk->conn, buf, buf_size)) { case PG_ASYNC_READ_SUCCESS: @@ -1936,56 +1975,23 @@ AsyncRead(int i, char **buf, int *buf_size) } /* - * Reads a CopyData block from the 'i'th WAL keeper's postgres connection, - * returning whether the read was successful. - * + * Read next message with known type into provided struct, by reading a CopyData + * block from the safekeeper's postgres connection, returning whether the read + * was successful. + * * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. 
*/ static bool -AsyncReadFixed(int i, void *value, size_t value_size) +AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) { - WalKeeper *wk = &walkeeper[i]; - char *buf = NULL; - int buf_size = -1; - - if (!(AsyncRead(i, &buf, &buf_size))) - return false; - - /* - * If we get here, the read was ok, but we still need to check it was the - * right amount - */ - if ((size_t) buf_size != value_size) - { - elog(FATAL, - "Unexpected walkeeper %s:%s read length from %s state. Expected %ld, found %d", - wk->host, wk->port, - FormatWalKeeperState(wk->state), - value_size, buf_size); - } - - /* Copy the resulting info into place */ - memcpy(value, buf, buf_size); - - return true; -} - -/* - * Read next message with known type into provided struct. - * TODO: migrate AsyncReadFixed here for all messages - */ -static bool -AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) -{ - WalKeeper *wk = &walkeeper[i]; char *buf; int buf_size; uint64 tag; StringInfoData s; - if (!(AsyncRead(i, &buf, &buf_size))) + if (!(AsyncRead(wk, &buf, &buf_size))) return false; /* parse it */ @@ -2004,6 +2010,14 @@ AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) switch (tag) { + case 'g': + { + AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; + msg->term = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + case 'v': { VoteResponse *msg = (VoteResponse *) anymsg; @@ -2023,6 +2037,20 @@ AsyncReadMessage(int i, AcceptorProposerMessage *anymsg) return true; } + case 'a': + { + AppendResponse *msg = (AppendResponse *) anymsg; + msg->term = pq_getmsgint64_le(&s); + msg->flushLsn = pq_getmsgint64_le(&s); + msg->commitLsn = pq_getmsgint64_le(&s); + msg->diskConsistentLsn = pq_getmsgint64_le(&s); + msg->hs.ts = pq_getmsgint64_le(&s); + msg->hs.xmin.value = pq_getmsgint64_le(&s); + msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + pq_getmsgend(&s); + return true; + } + default: { Assert(false); @@ -2038,9 +2066,8 @@ AsyncReadMessage(int i, 
AcceptorProposerMessage *anymsg) * single packet. */ static bool -BlockingWrite(int i, void *msg, size_t msg_size, WalKeeperState success_state) +BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state) { - WalKeeper *wk = &walkeeper[i]; uint32 events; if (!walprop_blocking_write(wk->conn, msg, msg_size)) @@ -2105,7 +2132,9 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * Flushes a previous call to AsyncWrite. This only needs to be called when the * socket becomes read or write ready *after* calling AsyncWrite. * - * If flushing successfully completes returns true, otherwise false. + * If flushing successfully completes returns true, otherwise false. Event set + * is updated only if connection fails, otherwise caller should manually unset + * WL_SOCKET_WRITEABLE. */ static bool AsyncFlush(WalKeeper *wk) @@ -2119,7 +2148,7 @@ AsyncFlush(WalKeeper *wk) switch (walprop_flush(wk->conn)) { case 0: - UpdateEventSet(wk, WL_SOCKET_READABLE); /* flush is done, unset write interest */ + /* flush is done */ return true; case 1: /* Nothing to do; try again when the socket's ready */ diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index c61ab87db45..74ea1cfd5b1 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -48,24 +48,15 @@ FormatWalKeeperState(WalKeeperState state) case SS_CONNECTING_WRITE: return_val = "connecting"; break; - case SS_EXEC_STARTWALPUSH: - return_val = "sending 'START_WAL_PUSH' query"; - break; case SS_WAIT_EXEC_RESULT: return_val = "receiving query result"; break; - case SS_HANDSHAKE_SEND: - return_val = "handshake (sending)"; - break; case SS_HANDSHAKE_RECV: return_val = "handshake (receiving)"; break; case SS_VOTING: return_val = "voting"; break; - case SS_SEND_VOTE: - return_val = "sending vote"; - break; case SS_WAIT_VERDICT: return_val = "wait-for-verdict"; break; @@ -140,19 
+131,6 @@ WalKeeperStateDesiredEvents(WalKeeperState state) result = WL_SOCKET_READABLE; break; - /* Most writing states don't require any socket conditions */ - case SS_EXEC_STARTWALPUSH: - case SS_HANDSHAKE_SEND: - case SS_SEND_VOTE: - result = WL_NO_EVENTS; - break; - /* but flushing does require read- or write-ready */ - case SS_SEND_ELECTED_FLUSH: - /* Active state does both reading and writing to the socket */ - case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; - /* Idle states use read-readiness as a sign that the connection has been * disconnected. */ case SS_VOTING: @@ -160,6 +138,18 @@ WalKeeperStateDesiredEvents(WalKeeperState state) result = WL_SOCKET_READABLE; break; + /* + * Flush states require write-ready for flushing. + * Active state does both reading and writing. + * + * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should + * check wk->flushWrite here to set WL_SOCKET_WRITEABLE. + */ + case SS_SEND_ELECTED_FLUSH: + case SS_ACTIVE: + result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + break; + /* The offline state expects no events. */ case SS_OFFLINE: result = WL_NO_EVENTS; @@ -169,16 +159,6 @@ WalKeeperStateDesiredEvents(WalKeeperState state) return result; } -/* Returns whether the WAL keeper state corresponds to something that should be - * immediately executed -- i.e. it is not idle, and is not currently waiting. */ -bool -StateShouldImmediatelyExecute(WalKeeperState state) -{ - /* This is actually pretty simple to determine. 
*/ - return WalKeeperStateDesiredEvents(state) == WL_NO_EVENTS - && state != SS_OFFLINE; -} - /* Returns a human-readable string corresponding to the event set * * If the events do not correspond to something set as the `events` field of a `WaitEvent`, the @@ -309,4 +289,4 @@ pq_sendint64_le(StringInfo buf, uint64 i) enlargeStringInfo(buf, sizeof(uint64)); memcpy(buf->data + buf->len, &i, sizeof(uint64)); buf->len += sizeof(uint64); -} \ No newline at end of file +} diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index ca27df2d19b..9506a6ee887 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -68,20 +68,12 @@ typedef enum } PGAsyncWriteResult; /* - * WAL safekeeper state + * WAL safekeeper state, which is used to wait for some event. * * States are listed here in the order that they're executed. * * Most states, upon failure, will move back to SS_OFFLINE by calls to * ResetConnection or ShutdownConnection. - * - * Also note: In places we say that a state "immediately" moves to another. This - * happens in states that only exist to execute program logic, so they run - * exactly once (when moved into), without waiting for any socket conditions. - * - * For example, when we set a WalKeeper's state to SS_SEND_VOTE, we immediately - * call AdvancePollState - during which the WalKeeper switches its state to - * SS_WAIT_VERDICT. */ typedef enum { @@ -99,28 +91,18 @@ typedef enum * they execute when polled, but we have this distinction in order to * recreate the event set in HackyRemoveWalProposerEvent. * - * After the connection is made, moves to SS_EXEC_STARTWALPUSH. + * After the connection is made, "START_WAL_PUSH" query is sent. */ SS_CONNECTING_WRITE, SS_CONNECTING_READ, - /* - * Sending the "START_WAL_PUSH" message as an empty query to the walkeeper. - * Performs a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. 
- */ - SS_EXEC_STARTWALPUSH, /* * Waiting for the result of the "START_WAL_PUSH" command. * - * After we get a successful result, moves to SS_HANDSHAKE_SEND. + * After we get a successful result, sends handshake to safekeeper. */ SS_WAIT_EXEC_RESULT, - /* - * Executing the sending half of the handshake. Performs the blocking send, - * then immediately moves to SS_HANDSHAKE_RECV. - */ - SS_HANDSHAKE_SEND, /* * Executing the receiving half of the handshake. After receiving, moves to * SS_VOTING. @@ -128,32 +110,28 @@ typedef enum SS_HANDSHAKE_RECV, /* - * Currently participating in voting, but a quorum hasn't yet been reached. + * Waiting to participate in voting, but a quorum hasn't yet been reached. * This is an idle state - we do not expect AdvancePollState to be called. * - * Moved externally to SS_SEND_VOTE or SS_WAIT_VERDICT by execution of - * SS_HANDSHAKE_RECV. + * Moved externally by execution of SS_HANDSHAKE_RECV, when we received a + * quorum of handshakes. */ SS_VOTING, - /* - * Performs a blocking send of the assigned vote, then immediately moves to - * SS_WAIT_VERDICT. - */ - SS_SEND_VOTE, + /* * Already sent voting information, waiting to receive confirmation from the - * node. After receiving, moves to SS_IDLE. + * node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet. */ SS_WAIT_VERDICT, - /* need to flush ProposerAnnouncement */ + /* Need to flush ProposerElected message. */ SS_SEND_ELECTED_FLUSH, /* * Waiting for quorum to send WAL. Idle state. If the socket becomes * read-ready, the connection has been closed. * - * Moves to SS_ACTIVE only by calls to SendMessageToNode. + * Moves to SS_ACTIVE only by call to StartStreaming. 
*/ SS_IDLE, @@ -195,7 +173,7 @@ typedef struct AcceptorProposerMessage */ typedef struct AcceptorGreeting { - uint64 tag; + AcceptorProposerMessage apm; term_t term; } AcceptorGreeting; @@ -306,11 +284,11 @@ typedef struct HotStandbyFeedback */ typedef struct AppendResponse { + AcceptorProposerMessage apm; /* * Current term of the safekeeper; if it is higher than proposer's, the * compute is out of date. */ - uint64 tag; term_t term; // TODO: add comment XLogRecPtr flushLsn; @@ -341,8 +319,8 @@ typedef struct WalKeeper WalProposerConn* conn; StringInfoData outbuf; - bool flushWrite; /* set to true if we wrote currMsg, but still need to call AsyncFlush */ - WalMessage* currMsg; /* message been send to the receiver */ + bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ + WalMessage* currMsg; /* message that wasn't sent yet or NULL, if we have nothing to send */ WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. 
Equal to -1 if no event */ @@ -361,7 +339,6 @@ int CompareLsn(const void *a, const void *b); char* FormatWalKeeperState(WalKeeperState state); void AssertEventsOkForState(uint32 events, WalKeeper* wk); uint32 WalKeeperStateDesiredEvents(WalKeeperState state); -bool StateShouldImmediatelyExecute(WalKeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); From 85b65e4a96abbf63f30ef7f8c3ed564fe048326a Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 4 Jan 2022 13:47:33 +0300 Subject: [PATCH 089/167] Don't change propTerm after quorum is acquired (#107) --- src/backend/replication/walproposer.c | 61 +++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 14f300d110b..d6ff8ef4afa 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -79,16 +79,23 @@ static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to * walkeepers */ static ProposerGreeting proposerGreeting; +static VoteRequest voteRequest; /* Vote request for walkeeper */ static WaitEventSet *waitEvents; static AppendResponse lastFeedback; /* - * minimal LSN which may be needed for recovery of some safekeeper, + * Minimal LSN which may be needed for recovery of some safekeeper, * record-aligned (first record which might not yet received by someone). */ static XLogRecPtr truncateLsn; -static VoteRequest voteRequest; /* Vote request for walkeeper */ +/* + * Term of the proposer. We want our term to be highest and unique, + * so we collect terms from safekeepers quorum, choose max and +1. + * After that our term is fixed and must not change. 
If we observe + * that some safekeeper has higher term, it means that we have another + * running compute, so we must stop immediately. + */ +static term_t propTerm; static TermHistory propTermHistory; /* term history of the proposer */ -static term_t propTerm; /* term of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ static int donor; /* Most advanced acceptor */ @@ -910,20 +917,42 @@ RecvAcceptorGreeting(WalKeeper *wk) wk->feedback.flushLsn = truncateLsn; wk->feedback.hs.ts = 0; - /* - * We want our term to be highest and unique, so choose max - * and +1 once we have majority. - */ - propTerm = Max(wk->greet.term, propTerm); + ++n_connected; + if (n_connected <= quorum) + { + /* We're still collecting terms from the majority. */ + propTerm = Max(wk->greet.term, propTerm); + + /* Quorum is acquried, prepare the vote request. */ + if (n_connected == quorum) + { + propTerm++; + elog(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, quorum, propTerm); + + voteRequest = (VoteRequest) + { + .tag = 'v', + .term = propTerm + }; + memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + } + } + else if (wk->greet.term > propTerm) + { + /* Another compute with higher term is running. */ + elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + wk->host, wk->port, + wk->greet.term, propTerm); + } /* * Check if we have quorum. If there aren't enough safekeepers, * wait and do nothing. We'll eventually get a task when the * election starts. * - * If we do have quorum, we can start an election + * If we do have quorum, we can start an election. 
*/ - if (++n_connected < quorum) + if (n_connected < quorum) { /* * SS_VOTING is an idle state; read-ready indicates the @@ -933,18 +962,6 @@ RecvAcceptorGreeting(WalKeeper *wk) } else { - if (n_connected == quorum) - { - propTerm++; - /* prepare voting message */ - voteRequest = (VoteRequest) - { - .tag = 'v', - .term = propTerm - }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); - } - /* * Now send voting request to the cohort and wait * responses From 8e82eae4bfa2f657a36262a3b4f36c9a5b6774a4 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 5 Jan 2022 13:36:00 +0300 Subject: [PATCH 090/167] walproposer renames (#116) * Rename walkeeper to safekeeper * Rename message variables as request/response --- .../libpqwalproposer/libpqwalproposer.c | 4 +- src/backend/replication/walproposer.c | 758 +++++++++--------- src/backend/replication/walproposer_utils.c | 24 +- src/include/replication/walproposer.h | 46 +- src/tools/pgindent/typedefs.list | 2 +- 5 files changed, 417 insertions(+), 417 deletions(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 177c93eb85d..f6714c08128 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -267,7 +267,7 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) * -2 if an error occured * (> 0) if it was successful; that value is the amount transferred. * - * The protocol we use between walproposer and walkeeper means that we + * The protocol we use between walproposer and safekeeper means that we * *usually* wouldn't expect to see that the copy is done, but this can * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). 
@@ -280,7 +280,7 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) { /* * If we get -1, it's probably because of a server error; the - * walkeeper won't normally send a CopyDone message. + * safekeeper won't normally send a CopyDone message. * * We can check PQgetResult to make sure that the server failed; * it'll always result in PGRES_FATAL_ERROR diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d6ff8ef4afa..6f89c23eb2f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -69,19 +69,19 @@ WalProposerFunctionsType *WalProposerFunctions = NULL; #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" -static int n_walkeepers = 0; +static int n_safekeepers = 0; static int quorum = 0; -static WalKeeper walkeeper[MAX_WALKEEPERS]; +static Safekeeper safekeeper[MAX_SAFEKEEPERS]; static WalMessage *msgQueueHead; static WalMessage *msgQueueTail; static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to * this point */ static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to - * walkeepers */ -static ProposerGreeting proposerGreeting; -static VoteRequest voteRequest; /* Vote request for walkeeper */ + * safekeepers */ +static ProposerGreeting greetRequest; +static VoteRequest voteRequest; /* Vote request for safekeeper */ static WaitEventSet *waitEvents; -static AppendResponse lastFeedback; +static AppendResponse quorumFeedback; /* * Minimal LSN which may be needed for recovery of some safekeeper, * record-aligned (first record which might not yet received by someone). 
@@ -111,45 +111,45 @@ static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); static void WalProposerStart(void); static void WalProposerLoop(void); static void InitEventSet(void); -static void UpdateEventSet(WalKeeper *wk, uint32 events); -static void HackyRemoveWalProposerEvent(WalKeeper *to_remove); -static void ShutdownConnection(WalKeeper *wk); -static void ResetConnection(WalKeeper *wk); +static void UpdateEventSet(Safekeeper *sk, uint32 events); +static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); +static void ShutdownConnection(Safekeeper *sk); +static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(TimestampTz now); -static void ReconnectWalKeepers(void); -static void AdvancePollState(WalKeeper *wk, uint32 events); -static void HandleConnectionEvent(WalKeeper *wk); -static void SendStartWALPush(WalKeeper *wk); -static void RecvStartWALPushResult(WalKeeper *wk); -static void SendProposerGreeting(WalKeeper *wk); -static void RecvAcceptorGreeting(WalKeeper *wk); -static void SendVoteRequest(WalKeeper *wk); -static void RecvVoteResponse(WalKeeper *wk); +static void ReconnectSafekeepers(void); +static void AdvancePollState(Safekeeper *sk, uint32 events); +static void HandleConnectionEvent(Safekeeper *sk); +static void SendStartWALPush(Safekeeper *sk); +static void RecvStartWALPushResult(Safekeeper *sk); +static void SendProposerGreeting(Safekeeper *sk); +static void RecvAcceptorGreeting(Safekeeper *sk); +static void SendVoteRequest(Safekeeper *sk); +static void RecvVoteResponse(Safekeeper *sk); static void HandleElectedProposer(void); static term_t GetHighestTerm(TermHistory *th); -static term_t GetEpoch(WalKeeper *wk); +static term_t GetEpoch(Safekeeper *sk); static void DetermineEpochStartLsn(void); static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); -static void SendProposerElected(WalKeeper *wk); +static void SendProposerElected(Safekeeper *sk); static void 
WalProposerStartStreaming(XLogRecPtr startpos); -static void StartStreaming(WalKeeper *wk); +static void StartStreaming(Safekeeper *sk); static void SendMessageToNode(int i, WalMessage *msg); static void BroadcastMessage(WalMessage *msg); static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); -static void HandleActiveState(WalKeeper *wk, uint32 events); -static bool SendAppendRequests(WalKeeper *wk); -static bool RecvAppendResponses(WalKeeper *wk); +static void HandleActiveState(Safekeeper *sk, uint32 events); +static bool SendAppendRequests(Safekeeper *sk); +static bool RecvAppendResponses(Safekeeper *sk); static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); static XLogRecPtr CalculateDiskConsistentLsn(void); static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); -static void HandleWalKeeperResponse(void); -static bool AsyncRead(WalKeeper *wk, char **buf, int *buf_size); -static bool AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg); -static bool BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state); -static bool AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state); -static bool AsyncFlush(WalKeeper *wk); +static void HandleSafekeeperResponse(void); +static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); +static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); +static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); +static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); +static bool AsyncFlush(Safekeeper *sk); /* * WAL proposer bgworker entry point. 
@@ -263,27 +263,27 @@ WalProposerPoll(void) { while (true) { - WalKeeper *wk; + Safekeeper *sk; int rc; WaitEvent event; TimestampTz now = GetCurrentTimestamp(); rc = WaitEventSetWait(waitEvents, TimeToReconnect(now), &event, 1, WAIT_EVENT_WAL_SENDER_MAIN); - wk = (WalKeeper *) event.user_data; + sk = (Safekeeper *) event.user_data; /* - * If the event contains something that one of our walkeeper states + * If the event contains something that one of our safekeeper states * was waiting for, we'll advance its state. */ if (rc != 0 && (event.events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))) - AdvancePollState(wk, event.events); + AdvancePollState(sk, event.events); /* - * If the timeout expired, attempt to reconnect to any walkeepers that + * If the timeout expired, attempt to reconnect to any safekeepers that * we dropped */ - ReconnectWalKeepers(); + ReconnectSafekeepers(); /* * If wait is terminated by latch set (walsenders' latch is set on @@ -362,52 +362,52 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) sep = strchr(port, ','); if (sep != NULL) *sep++ = '\0'; - if (n_walkeepers + 1 >= MAX_WALKEEPERS) + if (n_safekeepers + 1 >= MAX_SAFEKEEPERS) { - elog(FATAL, "Too many walkeepers"); + elog(FATAL, "Too many safekeepers"); } - walkeeper[n_walkeepers].host = host; - walkeeper[n_walkeepers].port = port; - walkeeper[n_walkeepers].state = SS_OFFLINE; - walkeeper[n_walkeepers].conn = NULL; + safekeeper[n_safekeepers].host = host; + safekeeper[n_safekeepers].port = port; + safekeeper[n_safekeepers].state = SS_OFFLINE; + safekeeper[n_safekeepers].conn = NULL; /* * Set conninfo to empty. 
We'll fill it out once later, in * `ResetConnection` as needed */ - walkeeper[n_walkeepers].conninfo[0] = '\0'; - initStringInfo(&walkeeper[n_walkeepers].outbuf); - walkeeper[n_walkeepers].flushWrite = false; - walkeeper[n_walkeepers].currMsg = NULL; - walkeeper[n_walkeepers].ackMsg = NULL; - walkeeper[n_walkeepers].startStreamingAt = InvalidXLogRecPtr; - n_walkeepers += 1; + safekeeper[n_safekeepers].conninfo[0] = '\0'; + initStringInfo(&safekeeper[n_safekeepers].outbuf); + safekeeper[n_safekeepers].flushWrite = false; + safekeeper[n_safekeepers].currMsg = NULL; + safekeeper[n_safekeepers].ackMsg = NULL; + safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + n_safekeepers += 1; } - if (n_walkeepers < 1) + if (n_safekeepers < 1) { - elog(FATAL, "WalKeepers addresses are not specified"); + elog(FATAL, "Safekeepers addresses are not specified"); } - quorum = n_walkeepers / 2 + 1; + quorum = n_safekeepers / 2 + 1; /* Fill the greeting package */ - proposerGreeting.tag = 'g'; - proposerGreeting.protocolVersion = SK_PROTOCOL_VERSION; - proposerGreeting.pgVersion = PG_VERSION_NUM; - pg_strong_random(&proposerGreeting.proposerId, sizeof(proposerGreeting.proposerId)); - proposerGreeting.systemId = systemId; + greetRequest.tag = 'g'; + greetRequest.protocolVersion = SK_PROTOCOL_VERSION; + greetRequest.pgVersion = PG_VERSION_NUM; + pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); + greetRequest.systemId = systemId; if (!zenith_timeline_walproposer) elog(FATAL, "zenith.zenith_timeline is not provided"); if (*zenith_timeline_walproposer != '\0' && - !HexDecodeString(proposerGreeting.ztimelineid, zenith_timeline_walproposer, 16)) + !HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); if (!zenith_tenant_walproposer) elog(FATAL, "zenith.zenith_tenant is not provided"); if (*zenith_tenant_walproposer != '\0' && - 
!HexDecodeString(proposerGreeting.ztenantid, zenith_tenant_walproposer, 16)) + !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); - proposerGreeting.timeline = ThisTimeLineID; - proposerGreeting.walSegSize = wal_segment_size; + greetRequest.timeline = ThisTimeLineID; + greetRequest.walSegSize = wal_segment_size; InitEventSet(); } @@ -416,10 +416,10 @@ static void WalProposerStart(void) { - /* Initiate connections to all walkeeper nodes */ - for (int i = 0; i < n_walkeepers; i++) + /* Initiate connections to all safekeeper nodes */ + for (int i = 0; i < n_safekeepers; i++) { - ResetConnection(&walkeeper[i]); + ResetConnection(&safekeeper[i]); } WalProposerLoop(); @@ -439,7 +439,7 @@ InitEventSet(void) if (waitEvents) elog(FATAL, "double-initialization of event set"); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_walkeepers); + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, @@ -447,20 +447,20 @@ InitEventSet(void) } /* - * Updates the events we're already waiting on for the WAL keeper, setting it to + * Updates the events we're already waiting on for the safekeeper, setting it to * the provided `events` * - * This function is called any time the WAL keeper's state switches to one where + * This function is called any time the safekeeper's state switches to one where * it has to wait to continue. This includes the full body of AdvancePollState * and calls to IO helper functions. 
*/ static void -UpdateEventSet(WalKeeper *wk, uint32 events) +UpdateEventSet(Safekeeper *sk, uint32 events) { /* eventPos = -1 when we don't have an event */ - Assert(wk->eventPos != -1); + Assert(sk->eventPos != -1); - ModifyWaitEvent(waitEvents, wk->eventPos, events, NULL); + ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } /* Hack: provides a way to remove the event corresponding to an individual walproposer from the set. @@ -468,7 +468,7 @@ UpdateEventSet(WalKeeper *wk, uint32 events) * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. */ static void -HackyRemoveWalProposerEvent(WalKeeper *to_remove) +HackyRemoveWalProposerEvent(Safekeeper *to_remove) { /* Remove the existing event set */ if (waitEvents) @@ -476,50 +476,50 @@ HackyRemoveWalProposerEvent(WalKeeper *to_remove) FreeWaitEventSet(waitEvents); waitEvents = NULL; } - /* Re-initialize it without adding any walkeeper events */ + /* Re-initialize it without adding any safekeeper events */ InitEventSet(); /* - * loop through the existing walkeepers. If they aren't the one we're + * loop through the existing safekeepers. If they aren't the one we're * removing, and if they have a socket we can use, re-add the applicable * events. */ - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { uint32 desired_events = WL_NO_EVENTS; - WalKeeper *wk = &walkeeper[i]; + Safekeeper *sk = &safekeeper[i]; - wk->eventPos = -1; + sk->eventPos = -1; - if (wk == to_remove) + if (sk == to_remove) continue; - /* If this WAL keeper isn't offline, add an event for it! */ - if (wk->conn != NULL) + /* If this safekeeper isn't offline, add an event for it! 
*/ + if (sk->conn != NULL) { - desired_events = WalKeeperStateDesiredEvents(wk->state); - wk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(wk->conn), NULL, wk); + desired_events = SafekeeperStateDesiredEvents(sk->state); + sk->eventPos = AddWaitEventToSet(waitEvents, desired_events, walprop_socket(sk->conn), NULL, sk); } } } -/* Shuts down and cleans up the connection for a walkeeper. Sets its state to SS_OFFLINE */ +/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ static void -ShutdownConnection(WalKeeper *wk) +ShutdownConnection(Safekeeper *sk) { - if (wk->conn) - walprop_finish(wk->conn); - wk->conn = NULL; - wk->state = SS_OFFLINE; - wk->flushWrite = false; - wk->currMsg = NULL; - wk->ackMsg = NULL; - - if (wk->voteResponse.termHistory.entries) - pfree(wk->voteResponse.termHistory.entries); - wk->voteResponse.termHistory.entries = NULL; - - HackyRemoveWalProposerEvent(wk); + if (sk->conn) + walprop_finish(sk->conn); + sk->conn = NULL; + sk->state = SS_OFFLINE; + sk->flushWrite = false; + sk->currMsg = NULL; + sk->ackMsg = NULL; + + if (sk->voteResponse.termHistory.entries) + pfree(sk->voteResponse.termHistory.entries); + sk->voteResponse.termHistory.entries = NULL; + + HackyRemoveWalProposerEvent(sk); } /* @@ -529,13 +529,13 @@ ShutdownConnection(WalKeeper *wk) * On success, sets the state to SS_CONNECTING_WRITE. */ static void -ResetConnection(WalKeeper *wk) +ResetConnection(Safekeeper *sk) { pgsocket sock; /* socket of the new connection */ - if (wk->state != SS_OFFLINE) + if (sk->state != SS_OFFLINE) { - ShutdownConnection(wk); + ShutdownConnection(sk); } /* @@ -544,25 +544,25 @@ ResetConnection(WalKeeper *wk) * If the connection information hasn't been filled out, we need to do * that here. 
*/ - if (wk->conninfo[0] == '\0') + if (sk->conninfo[0] == '\0') { int written = 0; - written = snprintf((char *) &wk->conninfo, MAXCONNINFO, + written = snprintf((char *) &sk->conninfo, MAXCONNINFO, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - wk->host, wk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); + sk->host, sk->port, zenith_timeline_walproposer, zenith_tenant_walproposer); // currently connection string is not that long, but once we pass something like jwt we might overflow the buffer, // so it is better to be defensive and check that everything aligns well if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not create connection string for walkeeper %s:%s", wk->host, wk->port); + elog(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } - wk->conn = walprop_connect_start((char *) &wk->conninfo); + sk->conn = walprop_connect_start((char *) &sk->conninfo); /* * "If the result is null, then libpq has been unable to allocate a new * PGconn structure" */ - if (!wk->conn) + if (!sk->conn) elog(FATAL, "failed to allocate new PGconn object"); /* @@ -570,7 +570,7 @@ ResetConnection(WalKeeper *wk) * PQconnectPoll. Before we do that though, we need to check that it * didn't immediately fail. 
*/ - if (walprop_status(wk->conn) == WP_CONNECTION_BAD) + if (walprop_status(sk->conn) == WP_CONNECTION_BAD) { /*--- * According to libpq docs: @@ -581,14 +581,14 @@ ResetConnection(WalKeeper *wk) * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ elog(WARNING, "Immediate failure to connect with node:\n\t%s\n\terror: %s", - wk->conninfo, walprop_error_message(wk->conn)); + sk->conninfo, walprop_error_message(sk->conn)); /* * Even though the connection failed, we still need to clean up the * object */ - walprop_finish(wk->conn); - wk->conn = NULL; + walprop_finish(sk->conn); + sk->conn = NULL; return; } @@ -605,12 +605,12 @@ ResetConnection(WalKeeper *wk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - elog(LOG, "Connecting with node %s:%s", wk->host, wk->port); + elog(LOG, "Connecting with node %s:%s", sk->host, sk->port); - wk->state = SS_CONNECTING_WRITE; + sk->state = SS_CONNECTING_WRITE; - sock = walprop_socket(wk->conn); - wk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, wk); + sock = walprop_socket(sk->conn); + sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); return; } @@ -635,46 +635,46 @@ TimeToReconnect(TimestampTz now) return (long) (till_reconnect / 1000); } -/* If the timeout has expired, attempt to reconnect to all offline walkeepers */ +/* If the timeout has expired, attempt to reconnect to all offline safekeepers */ static void -ReconnectWalKeepers(void) +ReconnectSafekeepers(void) { TimestampTz now = GetCurrentTimestamp(); if (TimeToReconnect(now) == 0) { last_reconnect_attempt = now; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_OFFLINE) - ResetConnection(&walkeeper[i]); + if (safekeeper[i].state == SS_OFFLINE) + ResetConnection(&safekeeper[i]); } } } /* - * Performs the logic for advancing the state machine of the specified 
walkeeper, + * Performs the logic for advancing the state machine of the specified safekeeper, * given that a certain set of events has occured. */ static void -AdvancePollState(WalKeeper *wk, uint32 events) +AdvancePollState(Safekeeper *sk, uint32 events) { /* * Sanity check. We assume further down that the operations don't * block because the socket is ready. */ - AssertEventsOkForState(events, wk); + AssertEventsOkForState(events, sk); /* Execute the code corresponding to the current state */ - switch (wk->state) + switch (sk->state) { /* - * WAL keepers are only taken out of SS_OFFLINE by calls to + * safekeepers are only taken out of SS_OFFLINE by calls to * ResetConnection */ case SS_OFFLINE: - elog(FATAL, "Unexpected walkeeper %s:%s state advancement: is offline", - wk->host, wk->port); + elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -684,21 +684,21 @@ AdvancePollState(WalKeeper *wk, uint32 events) */ case SS_CONNECTING_READ: case SS_CONNECTING_WRITE: - HandleConnectionEvent(wk); + HandleConnectionEvent(sk); break; /* * Waiting for a successful CopyBoth response. */ case SS_WAIT_EXEC_RESULT: - RecvStartWALPushResult(wk); + RecvStartWALPushResult(sk); break; /* * Finish handshake comms: receive information about the safekeeper. */ case SS_HANDSHAKE_RECV: - RecvAcceptorGreeting(wk); + RecvAcceptorGreeting(sk); break; /* @@ -707,14 +707,14 @@ AdvancePollState(WalKeeper *wk, uint32 events) * transferred from SS_VOTING to sending actual vote requests. 
*/ case SS_VOTING: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); return; /* Read the safekeeper response for our candidate */ case SS_WAIT_VERDICT: - RecvVoteResponse(wk); + RecvVoteResponse(sk); break; /* Flush proposer announcement message */ @@ -725,35 +725,35 @@ AdvancePollState(WalKeeper *wk, uint32 events) * completes. If we still have more to do, we'll wait until the next * poll comes along. */ - if (!AsyncFlush(wk)) + if (!AsyncFlush(sk)) return; /* flush is done, event set and state will be updated later */ - StartStreaming(wk); + StartStreaming(sk); break; /* * Idle state for waiting votes from quorum. */ case SS_IDLE: - elog(WARNING, "EOF from node %s:%s in %s state", wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); + elog(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); return; /* * Active state is used for streaming WAL and receiving feedback. */ case SS_ACTIVE: - HandleActiveState(wk, events); + HandleActiveState(sk, events); break; } } static void -HandleConnectionEvent(WalKeeper *wk) +HandleConnectionEvent(Safekeeper *sk) { - WalProposerConnectPollStatusType result = walprop_connect_poll(wk->conn); + WalProposerConnectPollStatusType result = walprop_connect_poll(sk->conn); /* The new set of events we'll wait on, after updating */ uint32 new_events = WL_NO_EVENTS; @@ -761,8 +761,8 @@ HandleConnectionEvent(WalKeeper *wk) switch (result) { case WP_CONN_POLLING_OK: - elog(LOG, "connected with node %s:%s", wk->host, - wk->port); + elog(LOG, "connected with node %s:%s", sk->host, + sk->port); /* * We have to pick some event to update event set. 
@@ -777,26 +777,26 @@ HandleConnectionEvent(WalKeeper *wk) * continue doing that */ case WP_CONN_POLLING_READING: - wk->state = SS_CONNECTING_READ; + sk->state = SS_CONNECTING_READ; new_events = WL_SOCKET_READABLE; break; case WP_CONN_POLLING_WRITING: - wk->state = SS_CONNECTING_WRITE; + sk->state = SS_CONNECTING_WRITE; new_events = WL_SOCKET_WRITEABLE; break; case WP_CONN_POLLING_FAILED: elog(WARNING, "Failed to connect to node '%s:%s': %s", - wk->host, wk->port, walprop_error_message(wk->conn)); + sk->host, sk->port, walprop_error_message(sk->conn)); /* * If connecting failed, we don't want to restart * the connection because that might run us into a * loop. Instead, shut it down -- it'll naturally * restart at a slower interval on calls to - * ReconnectWalKeepers. + * ReconnectSafekeepers. */ - ShutdownConnection(wk); + ShutdownConnection(sk); return; } @@ -805,21 +805,21 @@ HandleConnectionEvent(WalKeeper *wk) * un-register the old event and re-register an event on * the new socket. */ - HackyRemoveWalProposerEvent(wk); - wk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(wk->conn), NULL, wk); + HackyRemoveWalProposerEvent(sk); + sk->eventPos = AddWaitEventToSet(waitEvents, new_events, walprop_socket(sk->conn), NULL, sk); /* If we successfully connected, send START_WAL_PUSH query */ if (result == WP_CONN_POLLING_OK) - SendStartWALPush(wk); + SendStartWALPush(sk); } /* - * Send "START_WAL_PUSH" message as an empty query to the walkeeper. Performs + * Send "START_WAL_PUSH" message as an empty query to the safekeeper. Performs * a blocking send, then immediately moves to SS_WAIT_EXEC_RESULT. If something * goes wrong, change state to SS_OFFLINE and shutdown the connection. 
*/ static void -SendStartWALPush(WalKeeper *wk) +SendStartWALPush(Safekeeper *sk) { char *query = NULL; if (zenith_pageserver_connstring_walproposer != NULL) { @@ -827,23 +827,23 @@ SendStartWALPush(WalKeeper *wk) } else { query = psprintf("START_WAL_PUSH"); } - if (!walprop_send_query(wk->conn, query)) + if (!walprop_send_query(sk->conn, query)) { pfree(query); - elog(WARNING, "Failed to send 'START_WAL_PUSH' query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); + elog(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); return; } pfree(query); - wk->state = SS_WAIT_EXEC_RESULT; - UpdateEventSet(wk, WL_SOCKET_READABLE); + sk->state = SS_WAIT_EXEC_RESULT; + UpdateEventSet(sk, WL_SOCKET_READABLE); } static void -RecvStartWALPushResult(WalKeeper *wk) +RecvStartWALPushResult(Safekeeper *sk) { - switch (walprop_get_query_result(wk->conn)) + switch (walprop_get_query_result(sk->conn)) { /* * Successful result, move on to starting the @@ -851,7 +851,7 @@ RecvStartWALPushResult(WalKeeper *wk) */ case WP_EXEC_SUCCESS_COPYBOTH: - SendProposerGreeting(wk); + SendProposerGreeting(sk); break; /* @@ -867,9 +867,9 @@ RecvStartWALPushResult(WalKeeper *wk) break; case WP_EXEC_FAILED: - elog(WARNING, "Failed to send query to walkeeper %s:%s: %s", - wk->host, wk->port, walprop_error_message(wk->conn)); - ShutdownConnection(wk); + elog(WARNING, "Failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, walprop_error_message(sk->conn)); + ShutdownConnection(sk); return; /* @@ -878,50 +878,50 @@ RecvStartWALPushResult(WalKeeper *wk) * generic "something went wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - elog(WARNING, "Received bad response from walkeeper %s:%s query execution", - wk->host, wk->port); - ShutdownConnection(wk); + elog(WARNING, "Received bad response from safekeeper %s:%s query execution", + sk->host, 
sk->port); + ShutdownConnection(sk); return; } } /* * Start handshake: first of all send information about the - * WAL keeper. After sending, we wait on SS_HANDSHAKE_RECV for + * safekeeper. After sending, we wait on SS_HANDSHAKE_RECV for * a response to finish the handshake. */ static void -SendProposerGreeting(WalKeeper *wk) +SendProposerGreeting(Safekeeper *sk) { /* * On failure, logging & resetting the connection is handled. * We just need to handle the control flow. */ - BlockingWrite(wk, &proposerGreeting, sizeof(proposerGreeting), SS_HANDSHAKE_RECV); + BlockingWrite(sk, &greetRequest, sizeof(greetRequest), SS_HANDSHAKE_RECV); } static void -RecvAcceptorGreeting(WalKeeper *wk) +RecvAcceptorGreeting(Safekeeper *sk) { /* * If our reading doesn't immediately succeed, any necessary * error handling or state setting is taken care of. We can * leave any other work until later. */ - wk->greet.apm.tag = 'g'; - if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->greet)) + sk->greetResponse.apm.tag = 'g'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; /* Protocol is all good, move to voting. */ - wk->state = SS_VOTING; - wk->feedback.flushLsn = truncateLsn; - wk->feedback.hs.ts = 0; + sk->state = SS_VOTING; + sk->appendResponse.flushLsn = truncateLsn; + sk->appendResponse.hs.ts = 0; ++n_connected; if (n_connected <= quorum) { /* We're still collecting terms from the majority. */ - propTerm = Max(wk->greet.term, propTerm); + propTerm = Max(sk->greetResponse.term, propTerm); /* Quorum is acquried, prepare the vote request. 
*/ if (n_connected == quorum) @@ -934,15 +934,15 @@ RecvAcceptorGreeting(WalKeeper *wk) .tag = 'v', .term = propTerm }; - memcpy(voteRequest.proposerId.data, proposerGreeting.proposerId.data, UUID_LEN); + memcpy(voteRequest.proposerId.data, greetRequest.proposerId.data, UUID_LEN); } } - else if (wk->greet.term > propTerm) + else if (sk->greetResponse.term > propTerm) { /* Another compute with higher term is running. */ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->greet.term, propTerm); + sk->host, sk->port, + sk->greetResponse.term, propTerm); } /* @@ -958,7 +958,7 @@ RecvAcceptorGreeting(WalKeeper *wk) * SS_VOTING is an idle state; read-ready indicates the * connection closed. */ - UpdateEventSet(wk, WL_SOCKET_READABLE); + UpdateEventSet(sk, WL_SOCKET_READABLE); } else { @@ -966,43 +966,43 @@ RecvAcceptorGreeting(WalKeeper *wk) * Now send voting request to the cohort and wait * responses */ - for (int j = 0; j < n_walkeepers; j++) + for (int j = 0; j < n_safekeepers; j++) { /* * Remember: SS_VOTING indicates that the safekeeper is * participating in voting, but hasn't sent anything * yet. 
*/ - if (walkeeper[j].state == SS_VOTING) - SendVoteRequest(&walkeeper[j]); + if (safekeeper[j].state == SS_VOTING) + SendVoteRequest(&safekeeper[j]); } } } static void -SendVoteRequest(WalKeeper *wk) +SendVoteRequest(Safekeeper *sk) { /* We have quorum for voting, send our vote request */ - elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, wk->host, wk->port, voteRequest.term); + elog(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, voteRequest.term); /* On failure, logging & resetting is handled */ - if (!BlockingWrite(wk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) + if (!BlockingWrite(sk, &voteRequest, sizeof(voteRequest), SS_WAIT_VERDICT)) return; /* If successful, wait for read-ready with SS_WAIT_VERDICT */ } static void -RecvVoteResponse(WalKeeper *wk) +RecvVoteResponse(Safekeeper *sk) { - wk->voteResponse.apm.tag = 'v'; - if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->voteResponse)) + sk->voteResponse.apm.tag = 'v'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; elog(LOG, "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - wk->host, wk->port, wk->voteResponse.voteGiven, GetHighestTerm(&wk->voteResponse.termHistory), - LSN_FORMAT_ARGS(wk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(wk->voteResponse.truncateLsn)); + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); /* * In case of acceptor rejecting our vote, bail out, but only @@ -1010,30 +1010,30 @@ RecvVoteResponse(WalKeeper *wk) * (concurrent compute spotted) or we are not elected yet and * thus need the vote. 
*/ - if ((!wk->voteResponse.voteGiven) && - (wk->voteResponse.term > propTerm || n_votes < quorum)) + if ((!sk->voteResponse.voteGiven) && + (sk->voteResponse.term > propTerm || n_votes < quorum)) { elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - wk->host, wk->port, - wk->voteResponse.term, propTerm); + sk->host, sk->port, + sk->voteResponse.term, propTerm); } - Assert(wk->voteResponse.term == propTerm); + Assert(sk->voteResponse.term == propTerm); /* Handshake completed, do we have quorum? */ n_votes++; if (n_votes < quorum) { - wk->state = SS_IDLE; /* can't do much yet, no quorum */ + sk->state = SS_IDLE; /* can't do much yet, no quorum */ } else if (n_votes > quorum) { /* recovery already performed, just start streaming */ - SendProposerElected(wk); + SendProposerElected(sk); } else { - wk->state = SS_IDLE; - UpdateEventSet(wk, WL_SOCKET_READABLE); /* Idle states wait for + sk->state = SS_IDLE; + UpdateEventSet(sk, WL_SOCKET_READABLE); /* Idle states wait for * read-ready */ HandleElectedProposer(); @@ -1064,7 +1064,7 @@ HandleElectedProposer(void) LSN_FORMAT_ARGS(truncateLsn), LSN_FORMAT_ARGS(propEpochStartLsn)); /* Perform recovery */ - if (!WalProposerRecovery(donor, proposerGreeting.timeline, truncateLsn, propEpochStartLsn)) + if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn)) elog(FATAL, "Failed to recover state"); } else if (syncSafekeepers) @@ -1074,10 +1074,10 @@ HandleElectedProposer(void) exit(0); } - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_IDLE) - SendProposerElected(&walkeeper[i]); + if (safekeeper[i].state == SS_IDLE) + SendProposerElected(&safekeeper[i]); } /* @@ -1097,7 +1097,7 @@ HandleElectedProposer(void) */ BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); - /* keep polling until all walkeepers are synced */ + /* keep polling until all safekeepers 
are synced */ return; } @@ -1114,9 +1114,9 @@ GetHighestTerm(TermHistory *th) /* safekeeper's epoch is the term of the highest entry in the log */ static term_t -GetEpoch(WalKeeper *wk) +GetEpoch(Safekeeper *sk) { - return GetHighestTerm(&wk->voteResponse.termHistory); + return GetHighestTerm(&sk->voteResponse.termHistory); } /* @@ -1136,19 +1136,19 @@ DetermineEpochStartLsn(void) donorEpoch = 0; truncateLsn = InvalidXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_IDLE) + if (safekeeper[i].state == SS_IDLE) { - if (GetEpoch(&walkeeper[i]) > donorEpoch || - (GetEpoch(&walkeeper[i]) == donorEpoch && - walkeeper[i].voteResponse.flushLsn > propEpochStartLsn)) + if (GetEpoch(&safekeeper[i]) > donorEpoch || + (GetEpoch(&safekeeper[i]) == donorEpoch && + safekeeper[i].voteResponse.flushLsn > propEpochStartLsn)) { - donorEpoch = GetEpoch(&walkeeper[i]); - propEpochStartLsn = walkeeper[i].voteResponse.flushLsn; + donorEpoch = GetEpoch(&safekeeper[i]); + propEpochStartLsn = safekeeper[i].voteResponse.flushLsn; donor = i; } - truncateLsn = Max(walkeeper[i].voteResponse.truncateLsn, truncateLsn); + truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); } } @@ -1177,7 +1177,7 @@ DetermineEpochStartLsn(void) /* * Proposer's term history is the donor's + its own entry. 
*/ - dth = &walkeeper[donor].voteResponse.termHistory; + dth = &safekeeper[donor].voteResponse.termHistory; propTermHistory.n_entries = dth->n_entries + 1; propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * propTermHistory.n_entries); memcpy(propTermHistory.entries, dth->entries, sizeof(TermSwitchEntry) * dth->n_entries); @@ -1188,13 +1188,13 @@ DetermineEpochStartLsn(void) quorum, propTerm, LSN_FORMAT_ARGS(propEpochStartLsn), - walkeeper[donor].host, walkeeper[donor].port, + safekeeper[donor].host, safekeeper[donor].port, LSN_FORMAT_ARGS(truncateLsn) ); } /* - * Receive WAL from most advanced WAL keeper + * Receive WAL from most advanced safekeeper */ static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) @@ -1205,20 +1205,20 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec WalRcvStreamOptions options; sprintf(conninfo, "host=%s port=%s dbname=replication options='-c ztimelineid=%s ztenantid=%s'", - walkeeper[donor].host, walkeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); + safekeeper[donor].host, safekeeper[donor].port, zenith_timeline_walproposer, zenith_tenant_walproposer); wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); if (!wrconn) { ereport(WARNING, (errmsg("could not connect to WAL acceptor %s:%s: %s", - walkeeper[donor].host, walkeeper[donor].port, + safekeeper[donor].host, safekeeper[donor].port, err))); return false; } elog(LOG, "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " "%d", - walkeeper[donor].host, walkeeper[donor].port, (uint32) (startpos >> 32), + safekeeper[donor].host, safekeeper[donor].port, (uint32) (startpos >> 32), (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; @@ -1274,16 +1274,16 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec } /* - * Determine for wk the starting streaming point and 
send it message + * Determine for sk the starting streaming point and send it message * 1) Announcing we are elected proposer (which immediately advances epoch if * safekeeper is synced, being important for sync-safekeepers) * 2) Communicating starting streaming point -- safekeeper must truncate its WAL * beyond it -- and history of term switching. * - * Sets wk->startStreamingAt. + * Sets sk->startStreamingAt. */ static void -SendProposerElected(WalKeeper *wk) +SendProposerElected(Safekeeper *sk) { ProposerElected msg; TermHistory *th; @@ -1298,13 +1298,13 @@ SendProposerElected(WalKeeper *wk) * there is some WAL on safekeeper, if immediately after bootstrap compute * wrote some WAL on single sk and died; we stream since the beginning then. */ - th = &wk->voteResponse.termHistory; + th = &sk->voteResponse.termHistory; /* * If any WAL is present on the sk, it must be authorized by some term. * OTOH, without any WAL there are no term swiches in the log. */ Assert((th->n_entries == 0) == - (wk->voteResponse.flushLsn == InvalidXLogRecPtr)); + (sk->voteResponse.flushLsn == InvalidXLogRecPtr)); /* We must start somewhere. */ Assert(propTermHistory.n_entries >= 1); @@ -1319,7 +1319,7 @@ SendProposerElected(WalKeeper *wk) if (i < 0) { /* safekeeper is empty or no common point, start from the beginning */ - wk->startStreamingAt = propTermHistory.entries[0].lsn; + sk->startStreamingAt = propTermHistory.entries[0].lsn; } else { @@ -1331,44 +1331,44 @@ SendProposerElected(WalKeeper *wk) */ if (propTermHistory.entries[i].term == propTerm) { - wk->startStreamingAt = wk->voteResponse.flushLsn; + sk->startStreamingAt = sk->voteResponse.flushLsn; } else { XLogRecPtr propEndLsn = propTermHistory.entries[i + 1].lsn; XLogRecPtr skEndLsn = (i + 1 < th->n_entries ? 
th->entries[i + 1].lsn : - wk->voteResponse.flushLsn); - wk->startStreamingAt = Min(propEndLsn, skEndLsn); + sk->voteResponse.flushLsn); + sk->startStreamingAt = Min(propEndLsn, skEndLsn); } } - Assert(msgQueueHead == NULL || wk->startStreamingAt >= msgQueueHead->req.beginLsn); + Assert(msgQueueHead == NULL || sk->startStreamingAt >= msgQueueHead->req.beginLsn); msg.tag = 'e'; msg.term = propTerm; - msg.startStreamingAt = wk->startStreamingAt; + msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &propTermHistory; lastCommonTerm = i >= 0 ? propTermHistory.entries[i].term : 0; elog(LOG, "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", - msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, wk->host, wk->port); + msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); - resetStringInfo(&wk->outbuf); - pq_sendint64_le(&wk->outbuf, msg.tag); - pq_sendint64_le(&wk->outbuf, msg.term); - pq_sendint64_le(&wk->outbuf, msg.startStreamingAt); - pq_sendint32_le(&wk->outbuf, msg.termHistory->n_entries); + resetStringInfo(&sk->outbuf); + pq_sendint64_le(&sk->outbuf, msg.tag); + pq_sendint64_le(&sk->outbuf, msg.term); + pq_sendint64_le(&sk->outbuf, msg.startStreamingAt); + pq_sendint32_le(&sk->outbuf, msg.termHistory->n_entries); for (int i = 0; i < msg.termHistory->n_entries; i++) { - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].term); - pq_sendint64_le(&wk->outbuf, msg.termHistory->entries[i].lsn); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); + pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); } - if (!AsyncWrite(wk, wk->outbuf.data, wk->outbuf.len, SS_SEND_ELECTED_FLUSH)) + if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; - StartStreaming(wk); + StartStreaming(sk); } /* @@ -1382,31 +1382,31 @@ 
WalProposerStartStreaming(XLogRecPtr startpos) elog(LOG, "WAL proposer starts streaming at %X/%X", LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; - cmd.timeline = proposerGreeting.timeline; + cmd.timeline = greetRequest.timeline; cmd.startpoint = startpos; StartReplication(&cmd); } /* - * Start streaming to safekeeper wk, always updates state to SS_ACTIVE and sets + * Start streaming to safekeeper sk, always updates state to SS_ACTIVE and sets * correct event set. */ static void -StartStreaming(WalKeeper *wk) +StartStreaming(Safekeeper *sk) { - int wki = wk - walkeeper; + int wki = sk - safekeeper; /* * This is the only entrypoint to state SS_ACTIVE. It's executed * exactly once for a connection. */ - wk->state = SS_ACTIVE; + sk->state = SS_ACTIVE; for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) { - if (msg->req.endLsn <= wk->startStreamingAt) + if (msg->req.endLsn <= sk->startStreamingAt) { - /* message is already received by this walkeeper */ + /* message is already received by this safekeeper */ msg->ackMask |= 1 << wki; } else @@ -1418,7 +1418,7 @@ StartStreaming(WalKeeper *wk) } /* Call SS_ACTIVE handler to update event set */ - HandleActiveState(wk, WL_NO_EVENTS); + HandleActiveState(sk, WL_NO_EVENTS); } /* @@ -1430,11 +1430,11 @@ StartStreaming(WalKeeper *wk) static void SendMessageToNode(int i, WalMessage *msg) { - WalKeeper *wk = &walkeeper[i]; + Safekeeper *sk = &safekeeper[i]; /* we shouldn't be already sending something */ - Assert(wk->currMsg == NULL); - Assert(wk->state == SS_ACTIVE); + Assert(sk->currMsg == NULL); + Assert(sk->state == SS_ACTIVE); /* * Skip already acknowledged messages. 
Used after reconnection to get to @@ -1443,21 +1443,21 @@ SendMessageToNode(int i, WalMessage *msg) while (msg != NULL && (msg->ackMask & (1 << i)) != 0) msg = msg->next; - wk->currMsg = msg; + sk->currMsg = msg; /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ - HandleActiveState(wk, WL_SOCKET_WRITEABLE); + HandleActiveState(sk, WL_SOCKET_WRITEABLE); } /* - * Broadcast new message to all caught-up walkeepers + * Broadcast new message to all caught-up safekeepers */ static void BroadcastMessage(WalMessage *msg) { - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].state == SS_ACTIVE && walkeeper[i].currMsg == NULL) + if (safekeeper[i].state == SS_ACTIVE && safekeeper[i].currMsg == NULL) { SendMessageToNode(i, msg); } @@ -1494,7 +1494,7 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) msg->req.epochStartLsn = propEpochStartLsn; msg->req.beginLsn = startpos; msg->req.endLsn = endpos; - msg->req.proposerId = proposerGreeting.proposerId; + msg->req.proposerId = greetRequest.proposerId; memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); Assert(msg->req.endLsn >= lastSentLsn); @@ -1503,7 +1503,7 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) } /* - * Create WAL message with no data, just to let the walkeepers + * Create WAL message with no data, just to let the safekeepers * know that commit lsn has advanced. */ static WalMessage * @@ -1537,7 +1537,7 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) */ msg->req.beginLsn = lsn; msg->req.endLsn = lsn; - msg->req.proposerId = proposerGreeting.proposerId; + msg->req.proposerId = greetRequest.proposerId; /* * truncateLsn and commitLsn are set just before the message sent, in @@ -1550,64 +1550,64 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) * Process all events happened in SS_ACTIVE state, update event set after that. 
*/ static void -HandleActiveState(WalKeeper *wk, uint32 events) +HandleActiveState(Safekeeper *sk, uint32 events) { uint32 newEvents = WL_SOCKET_READABLE; if (events & WL_SOCKET_WRITEABLE) - if (!SendAppendRequests(wk)) + if (!SendAppendRequests(sk)) return; if (events & WL_SOCKET_READABLE) - if (!RecvAppendResponses(wk)) + if (!RecvAppendResponses(sk)) return; /* * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. * - * wk->currMsg checks if we have pending unsent messages. This check isn't + * sk->currMsg checks if we have pending unsent messages. This check isn't * necessary now, because we always send queue messages immediately after * creation. But it's good to have it here in case we change this behavior * in the future. */ - if (wk->currMsg != NULL || wk->flushWrite) + if (sk->currMsg != NULL || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; - UpdateEventSet(wk, newEvents); + UpdateEventSet(sk, newEvents); } /* - * Send queue messages starting from wk->currMsg until the end or non-writable + * Send queue messages starting from sk->currMsg until the end or non-writable * socket, whichever comes first. Caller should take care of updating event set. * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. */ static bool -SendAppendRequests(WalKeeper *wk) +SendAppendRequests(Safekeeper *sk) { - int wki = wk - walkeeper; + int wki = sk - safekeeper; WalMessage *msg; AppendRequestHeader *req; PGAsyncWriteResult writeResult; - if (wk->flushWrite) + if (sk->flushWrite) { - if (!AsyncFlush(wk)) + if (!AsyncFlush(sk)) /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. 
*/ - return wk->state == SS_ACTIVE; + return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ - wk->flushWrite = false; + sk->flushWrite = false; } - while (wk->currMsg) + while (sk->currMsg) { - msg = wk->currMsg; + msg = sk->currMsg; req = &msg->req; req->commitLsn = GetAcknowledgedByQuorumWALPosition(); @@ -1620,20 +1620,20 @@ SendAppendRequests(WalKeeper *wk) * form the cut version. Only happens for the first * message. */ - if (wk->startStreamingAt > msg->req.beginLsn) + if (sk->startStreamingAt > msg->req.beginLsn) { uint32 len; uint32 size; - Assert(wk->startStreamingAt < req->endLsn); + Assert(sk->startStreamingAt < req->endLsn); - len = msg->req.endLsn - wk->startStreamingAt; + len = msg->req.endLsn - sk->startStreamingAt; size = sizeof(AppendRequestHeader) + len; req = malloc(size); *req = msg->req; - req->beginLsn = wk->startStreamingAt; + req->beginLsn = sk->startStreamingAt; memcpy(req + 1, - (char *) (&msg->req + 1) + wk->startStreamingAt - + (char *) (&msg->req + 1) + sk->startStreamingAt - msg->req.beginLsn, len); } @@ -1644,25 +1644,25 @@ SendAppendRequests(WalKeeper *wk) LSN_FORMAT_ARGS(req->beginLsn), LSN_FORMAT_ARGS(req->endLsn), LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), wk->host, wk->port); + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port); /* if this is the first sent message, we should start processing feedback */ - if (wk->ackMsg == NULL) - wk->ackMsg = wk->currMsg; + if (sk->ackMsg == NULL) + sk->ackMsg = sk->currMsg; /* * We write with msg->size here because the body of the * message is stored after the end of the WalMessage * struct, in the allocation for each msg */ - writeResult = walprop_async_write(wk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); + writeResult = walprop_async_write(sk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); /* Free up resources */ if (req != &msg->req) free(req); /* Mark current message as sent, 
whatever the result is */ - wk->currMsg = wk->currMsg->next; + sk->currMsg = sk->currMsg->next; switch (writeResult) { @@ -1675,14 +1675,14 @@ SendAppendRequests(WalKeeper *wk) * We still need to call PQflush some more to finish the job. * Caller function will handle this by setting right event set. */ - wk->flushWrite = true; + sk->flushWrite = true; return true; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; default: Assert(false); @@ -1702,10 +1702,10 @@ SendAppendRequests(WalKeeper *wk) * NB: This function can call SendMessageToNode and produce new messages. */ static bool -RecvAppendResponses(WalKeeper *wk) +RecvAppendResponses(Safekeeper *sk) { XLogRecPtr minQuorumLsn; - int wki = wk - walkeeper; + int wki = sk - safekeeper; bool readAnything = false; while (true) @@ -1715,36 +1715,36 @@ RecvAppendResponses(WalKeeper *wk) * necessary error handling or state setting is taken care * of. We can leave any other work until later. */ - wk->feedback.apm.tag = 'a'; - if (!AsyncReadMessage(wk, (AcceptorProposerMessage *) &wk->feedback)) + sk->appendResponse.apm.tag = 'a'; + if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - Assert(wk->ackMsg != NULL && (wk->ackMsg->ackMask & (1 << wki)) == 0); + Assert(sk->ackMsg != NULL && (sk->ackMsg->ackMask & (1 << wki)) == 0); /* - * We shouldn't read responses ahead of wk->currMsg, because that will + * We shouldn't read responses ahead of sk->currMsg, because that will * look like we are receiving responses for messages that haven't been * sent yet. 
*/ - Assert(wk->ackMsg != wk->currMsg); + Assert(sk->ackMsg != sk->currMsg); - wk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms + sk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms * receiving of this * message */ - wk->ackMsg = wk->ackMsg->next; + sk->ackMsg = sk->ackMsg->next; readAnything = true; } if (!readAnything) - return wk->state == SS_ACTIVE; + return sk->state == SS_ACTIVE; - HandleWalKeeperResponse(); + HandleSafekeeperResponse(); /* - * Also send the new commit lsn to all the walkeepers. + * Also send the new commit lsn to all the safekeepers. * - * FIXME: This is redundant for walkeepers that have other + * FIXME: This is redundant for safekeepers that have other * outbound messages pending. */ minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); @@ -1754,11 +1754,11 @@ RecvAppendResponses(WalKeeper *wk) lastSentCommitLsn = minQuorumLsn; } - return wk->state == SS_ACTIVE; + return sk->state == SS_ACTIVE; } /* - * Combine hot standby feedbacks from all walkeepers. + * Combine hot standby feedbacks from all safekeepers. 
*/ static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) @@ -1767,19 +1767,19 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) hs->xmin.value = ~0; /* largest unsigned value */ hs->catalog_xmin.value = ~0; /* largest unsigned value */ - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].feedback.hs.ts != 0) + if (safekeeper[i].appendResponse.hs.ts != 0) { - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.xmin, hs->xmin)) + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.xmin, hs->xmin)) { - hs->xmin = walkeeper[i].feedback.hs.xmin; - hs->ts = walkeeper[i].feedback.hs.ts; + hs->xmin = safekeeper[i].appendResponse.hs.xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; } - if (FullTransactionIdPrecedes(walkeeper[i].feedback.hs.catalog_xmin, hs->catalog_xmin)) + if (FullTransactionIdPrecedes(safekeeper[i].appendResponse.hs.catalog_xmin, hs->catalog_xmin)) { - hs->catalog_xmin = walkeeper[i].feedback.hs.catalog_xmin; - hs->ts = walkeeper[i].feedback.hs.ts; + hs->catalog_xmin = safekeeper[i].appendResponse.hs.catalog_xmin; + hs->ts = safekeeper[i].appendResponse.hs.ts; } } } @@ -1792,11 +1792,11 @@ static XLogRecPtr CalculateDiskConsistentLsn(void) { XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].feedback.diskConsistentLsn < lsn) + if (safekeeper[i].appendResponse.diskConsistentLsn < lsn) { - lsn = walkeeper[i].feedback.diskConsistentLsn; + lsn = safekeeper[i].appendResponse.diskConsistentLsn; } } return lsn; @@ -1810,10 +1810,10 @@ static XLogRecPtr CalculateMinFlushLsn(void) { XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - if (walkeeper[i].feedback.flushLsn < lsn) - lsn = walkeeper[i].feedback.flushLsn; + if (safekeeper[i].appendResponse.flushLsn < lsn) + lsn = safekeeper[i].appendResponse.flushLsn; } return lsn; } @@ 
-1824,30 +1824,30 @@ CalculateMinFlushLsn(void) static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void) { - XLogRecPtr responses[MAX_WALKEEPERS]; + XLogRecPtr responses[MAX_SAFEKEEPERS]; /* * Sort acknowledged LSNs */ - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { /* * Like in Raft, we aren't allowed to commit entries from previous * terms, so ignore reported LSN until it gets to epochStartLsn. */ - responses[i] = walkeeper[i].feedback.flushLsn >= propEpochStartLsn ? - walkeeper[i].feedback.flushLsn : 0; + responses[i] = safekeeper[i].appendResponse.flushLsn >= propEpochStartLsn ? + safekeeper[i].appendResponse.flushLsn : 0; } - qsort(responses, n_walkeepers, sizeof(XLogRecPtr), CompareLsn); + qsort(responses, n_safekeepers, sizeof(XLogRecPtr), CompareLsn); /* * Get the smallest LSN committed by quorum */ - return responses[n_walkeepers - quorum]; + return responses[n_safekeepers - quorum]; } static void -HandleWalKeeperResponse(void) +HandleSafekeeperResponse(void) { HotStandbyFeedback hsFeedback; XLogRecPtr minQuorumLsn; @@ -1857,30 +1857,30 @@ HandleWalKeeperResponse(void) minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = CalculateDiskConsistentLsn(); - if (minQuorumLsn > lastFeedback.flushLsn || diskConsistentLsn != lastFeedback.diskConsistentLsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.diskConsistentLsn) { - if (minQuorumLsn > lastFeedback.flushLsn) - lastFeedback.flushLsn = minQuorumLsn; + if (minQuorumLsn > quorumFeedback.flushLsn) + quorumFeedback.flushLsn = minQuorumLsn; - lastFeedback.diskConsistentLsn = diskConsistentLsn; + quorumFeedback.diskConsistentLsn = diskConsistentLsn; /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( // write_lsn - This is what durably stored in WAL service. - lastFeedback.flushLsn, + quorumFeedback.flushLsn, //flush_lsn - This is what durably stored in WAL service. 
- lastFeedback.flushLsn, + quorumFeedback.flushLsn, //apply_lsn - This is what processed and durably saved at pageserver. - lastFeedback.diskConsistentLsn, + quorumFeedback.diskConsistentLsn, GetCurrentTimestamp(), false); } CombineHotStanbyFeedbacks(&hsFeedback); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &lastFeedback.hs, sizeof hsFeedback) != 0) + if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) { - lastFeedback.hs = hsFeedback; + quorumFeedback.hs = hsFeedback; if (!syncSafekeepers) ProcessStandbyHSFeedback(hsFeedback.ts, XidFromFullTransactionId(hsFeedback.xmin), @@ -1909,7 +1909,7 @@ HandleWalKeeperResponse(void) truncateLsn = minFlushLsn; /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_walkeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_safekeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) { WalMessage *msg = msgQueueHead; msgQueueHead = msg->next; @@ -1936,28 +1936,28 @@ HandleWalKeeperResponse(void) * moment we don't have any good mechanism of defining the healthy and * most advanced safekeeper who should push the wal into pageserver and * basically the random one gets connected, to prevent hanging basebackup - * (due to pageserver connecting to not-synced-walkeeper) we currently - * wait for all seemingly alive walkeepers to get synced. + * (due to pageserver connecting to not-synced-safekeeper) we currently + * wait for all seemingly alive safekeepers to get synced. 
*/ if (syncSafekeepers) { int n_synced; n_synced = 0; - for (int i = 0; i < n_walkeepers; i++) + for (int i = 0; i < n_safekeepers; i++) { - WalKeeper *wk = &walkeeper[i]; - bool synced = wk->feedback.commitLsn >= propEpochStartLsn; + Safekeeper *sk = &safekeeper[i]; + bool synced = sk->appendResponse.commitLsn >= propEpochStartLsn; /* alive safekeeper which is not synced yet; wait for it */ - if (wk->state != SS_OFFLINE && !synced) + if (sk->state != SS_OFFLINE && !synced) return; if (synced) n_synced++; } if (n_synced >= quorum) { - /* All walkeepers synced! */ + /* All safekeepers synced! */ fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(propEpochStartLsn)); exit(0); } @@ -1969,9 +1969,9 @@ HandleWalKeeperResponse(void) * failure. */ static bool -AsyncRead(WalKeeper *wk, char **buf, int *buf_size) +AsyncRead(Safekeeper *sk, char **buf, int *buf_size) { - switch (walprop_async_read(wk->conn, buf, buf_size)) + switch (walprop_async_read(sk->conn, buf, buf_size)) { case PG_ASYNC_READ_SUCCESS: return true; @@ -1981,10 +1981,10 @@ AsyncRead(WalKeeper *wk, char **buf, int *buf_size) return false; case PG_ASYNC_READ_FAIL: - elog(WARNING, "Failed to read from node %s:%s in %s state: %s", wk->host, - wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + elog(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; } Assert(false); @@ -2001,14 +2001,14 @@ AsyncRead(WalKeeper *wk, char **buf, int *buf_size) * failed, a warning is emitted and the connection is reset. 
*/ static bool -AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) +AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) { char *buf; int buf_size; uint64 tag; StringInfoData s; - if (!(AsyncRead(wk, &buf, &buf_size))) + if (!(AsyncRead(sk, &buf, &buf_size))) return false; /* parse it */ @@ -2019,9 +2019,9 @@ AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) tag = pq_getmsgint64_le(&s); if (tag != anymsg->tag) { - elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, wk->host, - wk->port, FormatWalKeeperState(wk->state)); - ResetConnection(wk); + elog(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk->state)); + ResetConnection(sk); return false; } @@ -2083,43 +2083,43 @@ AsyncReadMessage(WalKeeper *wk, AcceptorProposerMessage *anymsg) * single packet. */ static bool -BlockingWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState success_state) +BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { uint32 events; - if (!walprop_blocking_write(wk->conn, msg, msg_size)) + if (!walprop_blocking_write(sk->conn, msg, msg_size)) { elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; } - wk->state = success_state; + sk->state = success_state; /* * If the new state will be waiting for events to happen, update the event * set to wait for those */ - events = WalKeeperStateDesiredEvents(success_state); + events = SafekeeperStateDesiredEvents(success_state); if (events) - UpdateEventSet(wk, events); + UpdateEventSet(sk, events); return true; } /* - * Starts a write into the 'i'th WAL keeper's postgres connection, moving to + * Starts a 
write into the 'i'th safekeeper's postgres connection, moving to * flush_state (adjusting eventset) if write still needs flushing. * * Returns false if sending is unfinished (requires flushing or conn failed). * Upon failure, a warning is emitted and the connection is reset. */ static bool -AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state) +AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state) { - switch (walprop_async_write(wk->conn, msg, msg_size)) + switch (walprop_async_write(sk->conn, msg, msg_size)) { case PG_ASYNC_WRITE_SUCCESS: return true; @@ -2130,14 +2130,14 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * to the appropriate state. Update the event set at the bottom of * this function */ - wk->state = flush_state; - UpdateEventSet(wk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); + sk->state = flush_state; + UpdateEventSet(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: elog(WARNING, "Failed to send to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ShutdownConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ShutdownConnection(sk); return false; default: Assert(false); @@ -2154,7 +2154,7 @@ AsyncWrite(WalKeeper *wk, void *msg, size_t msg_size, WalKeeperState flush_state * WL_SOCKET_WRITEABLE. 
*/ static bool -AsyncFlush(WalKeeper *wk) +AsyncFlush(Safekeeper *sk) { /*--- * PQflush returns: @@ -2162,7 +2162,7 @@ AsyncFlush(WalKeeper *wk) * 1 if unable to send everything yet [call PQflush again] * -1 if it failed [emit an error] */ - switch (walprop_flush(wk->conn)) + switch (walprop_flush(sk->conn)) { case 0: /* flush is done */ @@ -2172,9 +2172,9 @@ AsyncFlush(WalKeeper *wk) return false; case -1: elog(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - wk->host, wk->port, FormatWalKeeperState(wk->state), - walprop_error_message(wk->conn)); - ResetConnection(wk); + sk->host, sk->port, FormatSafekeeperState(sk->state), + walprop_error_message(sk->conn)); + ResetConnection(sk); return false; default: Assert(false); diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 74ea1cfd5b1..37f8d2075f6 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -22,20 +22,20 @@ CompareLsn(const void *a, const void *b) return 1; } -/* Returns a human-readable string corresonding to the WalKeeperState +/* Returns a human-readable string corresonding to the SafekeeperState * * The string should not be freed. 
* * The strings are intended to be used as a prefix to "state", e.g.: * - * elog(LOG, "currently in %s state", FormatWalKeeperState(wk->state)); + * elog(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * elog(LOG, "currently in state [%s]", FormatWalKeeperState(wk->state)); + * elog(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); */ char* -FormatWalKeeperState(WalKeeperState state) +FormatSafekeeperState(SafekeeperState state) { char* return_val = NULL; @@ -76,11 +76,11 @@ FormatWalKeeperState(WalKeeperState state) return return_val; } -/* Asserts that the provided events are expected for given WAL keeper's state */ +/* Asserts that the provided events are expected for given safekeeper's state */ void -AssertEventsOkForState(uint32 events, WalKeeper* wk) +AssertEventsOkForState(uint32 events, Safekeeper* sk) { - uint32 expected = WalKeeperStateDesiredEvents(wk->state); + uint32 expected = SafekeeperStateDesiredEvents(sk->state); /* The events are in-line with what we're expecting, under two conditions: * (a) if we aren't expecting anything, `events` has no read- or @@ -99,17 +99,17 @@ AssertEventsOkForState(uint32 events, WalKeeper* wk) { /* To give a descriptive message in the case of failure, we use elog and * then an assertion that's guaranteed to fail. */ - elog(WARNING, "events %s mismatched for walkeeper %s:%s in state [%s]", - FormatEvents(events), wk->host, wk->port, FormatWalKeeperState(wk->state)); + elog(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(events), sk->host, sk->port, FormatSafekeeperState(sk->state)); Assert(events_ok_for_state); } } -/* Returns the set of events a WAL keeper in this state should be waiting on +/* Returns the set of events a safekeeper in this state should be waiting on * * This will return WL_NO_EVENTS (= 0) for some events. 
*/ uint32 -WalKeeperStateDesiredEvents(WalKeeperState state) +SafekeeperStateDesiredEvents(SafekeeperState state) { uint32 result; @@ -143,7 +143,7 @@ WalKeeperStateDesiredEvents(WalKeeperState state) * Active state does both reading and writing. * * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We should - * check wk->flushWrite here to set WL_SOCKET_WRITEABLE. + * check sk->flushWrite here to set WL_SOCKET_WRITEABLE. */ case SS_SEND_ELECTED_FLUSH: case SS_ACTIVE: diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 9506a6ee887..51308cbe5a4 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -1,5 +1,5 @@ -#ifndef __WALKEEPER_H__ -#define __WALKEEPER_H__ +#ifndef __WALPROPOSER_H__ +#define __WALPROPOSER_H__ #include "access/xlogdefs.h" #include "postgres.h" @@ -13,7 +13,7 @@ #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 1 -#define MAX_WALKEEPERS 32 +#define MAX_SAFEKEEPERS 32 #define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ #define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ @@ -140,7 +140,7 @@ typedef enum * to read. */ SS_ACTIVE, -} WalKeeperState; +} SafekeeperState; /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -153,7 +153,7 @@ typedef uint64 term_t; typedef struct ProposerGreeting { uint64 tag; /* message tag */ - uint32 protocolVersion; /* proposer-walkeeper protocol version */ + uint32 protocolVersion; /* proposer-safekeeper protocol version */ uint32 pgVersion; pg_uuid_t proposerId; uint64 systemId; /* Postgres system identifier */ @@ -210,7 +210,7 @@ typedef struct VoteResponse { * proposer to choose the most advanced one. 
*/ XLogRecPtr flushLsn; - XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some walkeeper */ + XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ TermHistory termHistory; } VoteResponse; @@ -229,7 +229,7 @@ typedef struct ProposerElected } ProposerElected; /* - * Header of request with WAL message sent from proposer to walkeeper. + * Header of request with WAL message sent from proposer to safekeeper. */ typedef struct AppendRequestHeader { @@ -242,7 +242,7 @@ typedef struct AppendRequestHeader XLogRecPtr epochStartLsn; XLogRecPtr beginLsn; /* start position of message in WAL */ XLogRecPtr endLsn; /* end position of message in WAL */ - XLogRecPtr commitLsn; /* LSN committed by quorum of walkeepers */ + XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */ /* * minimal LSN which may be needed for recovery of some safekeeper (end lsn * + 1 of last chunk streamed to everyone) @@ -260,7 +260,7 @@ struct WalMessage WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ - AppendRequestHeader req; /* request to walkeeper (message header) */ + AppendRequestHeader req; /* request to safekeeper (message header) */ /* PHANTOM FIELD: * @@ -280,7 +280,7 @@ typedef struct HotStandbyFeedback } HotStandbyFeedback; /* - * Report walkeeper state to proposer + * Report safekeeper state to proposer */ typedef struct AppendResponse { @@ -302,9 +302,9 @@ typedef struct AppendResponse /* - * Descriptor of walkeeper + * Descriptor of safekeeper */ -typedef struct WalKeeper +typedef struct Safekeeper { char const* host; char const* port; @@ -324,21 +324,21 @@ typedef struct WalKeeper WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. 
Equal to -1 if no event */ - WalKeeperState state; /* walkeeper state machine state */ - AcceptorGreeting greet; /* acceptor greeting */ + SafekeeperState state; /* safekeeper state machine state */ + AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ - AppendResponse feedback; /* feedback to master */ + AppendResponse appendResponse; /* feedback to master */ /* * Streaming will start here; must be record boundary. */ XLogRecPtr startStreamingAt; -} WalKeeper; +} Safekeeper; int CompareLsn(const void *a, const void *b); -char* FormatWalKeeperState(WalKeeperState state); -void AssertEventsOkForState(uint32 events, WalKeeper* wk); -uint32 WalKeeperStateDesiredEvents(WalKeeperState state); +char* FormatSafekeeperState(SafekeeperState state); +void AssertEventsOkForState(uint32 events, Safekeeper* sk); +uint32 SafekeeperStateDesiredEvents(SafekeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); @@ -385,7 +385,7 @@ typedef enum WP_EXEC_SUCCESS_COPYBOTH, /* Any success result other than a single CopyBoth was received. The specifics of the result * were already logged, but it may be useful to provide an error message indicating which - * walkeeper messed up. + * safekeeper messed up. * * Do not expect PQerrorMessage to be appropriately set. */ WP_EXEC_UNEXPECTED_SUCCESS, @@ -441,11 +441,11 @@ typedef void (*walprop_finish_fn) (WalProposerConn* conn); /* * Ergonomic wrapper around PGgetCopyData * - * Reads a CopyData block from a walkeeper, setting *amount to the number + * Reads a CopyData block from a safekeeper, setting *amount to the number * of bytes returned. * * This function is allowed to assume certain properties specific to the - * protocol with the walkeepers, so it should not be used as-is for any + * protocol with the safekeepers, so it should not be used as-is for any * other purpose. 
* * Note: If possible, using is generally preferred, because it @@ -459,7 +459,7 @@ typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn, /* * Ergonomic wrapper around PQputCopyData + PQflush * - * Starts to write a CopyData block to a walkeeper. + * Starts to write a CopyData block to a safekeeper. * * For information on the meaning of return codes, refer to PGAsyncWriteResult. */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 5bc3539d783..760f3842121 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2848,7 +2848,7 @@ WaitEventTimeout WaitPMResult WalCloseMethod WalLevel -WalKeeper +Safekeeper WalMessage WalRcvData WalRcvExecResult From cb04c36b3f1a05994bd65382e0cafbd4e86c41fb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 09:54:56 +0300 Subject: [PATCH 091/167] Report back-pressure trottling status of backend --- src/backend/replication/walsender.c | 2 +- src/backend/tcop/postgres.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 9f3b0bf64c0..141ee974349 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3787,7 +3787,7 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply uint64 backpressure_lag(void) { - if (max_replication_apply_lag != 0 || max_replication_flush_lag != 0) + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0) { XLogRecPtr writePtr; XLogRecPtr flushPtr; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 35523475281..4675f16c9fa 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3393,6 +3393,8 @@ ProcessInterrupts(void) if (lag <= 0) break; + set_ps_display("backpressure throttling"); + elog(DEBUG2, "backpressure throttling: lag %lu", lag); pg_usleep(BACK_PRESSURE_DELAY); } From 
808848db200af93d94df37c9eabe8b546c3427dd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 16:27:08 +0300 Subject: [PATCH 092/167] Add max_replication_write_lag --- src/backend/access/transam/xloginsert.c | 1 + src/backend/replication/walsender.c | 9 ++++++++- src/backend/utils/misc/guc.c | 12 ++++++++++++ src/include/access/xloginsert.h | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 81bea0fb19e..2d3549d1e7f 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -66,6 +66,7 @@ typedef struct /* GUCs */ int max_replication_apply_lag; int max_replication_flush_lag; +int max_replication_write_lag; static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 141ee974349..507318e113c 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3787,7 +3787,7 @@ GetMinReplicaLsn(XLogRecPtr* write_lsn, XLogRecPtr* flush_lsn, XLogRecPtr* apply uint64 backpressure_lag(void) { - if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0) + if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0) { XLogRecPtr writePtr; XLogRecPtr flushPtr; @@ -3803,6 +3803,13 @@ backpressure_lag(void) LSN_FORMAT_ARGS(flushPtr), LSN_FORMAT_ARGS(applyPtr)); + if ((writePtr != UnknownXLogRecPtr + && max_replication_write_lag > 0 + && myFlushLsn > writePtr + max_replication_write_lag*MB)) + { + return (myFlushLsn - writePtr - max_replication_write_lag*MB); + } + if ((flushPtr != UnknownXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag*MB)) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5d3b1e04fa4..95e2dbf873a 100644 --- 
a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2923,6 +2923,18 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximal write lag between master and replicas."), + gettext_noop("When lag between minimal write position of replica and current LSN exceeds this value," + "backends are blocked"), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + -1, -1, INT_MAX, /* it should not be smaller than maximal size of WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."), diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 45dcaf99d9e..391c1a2716a 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -40,6 +40,7 @@ extern int max_replication_apply_lag; extern int max_replication_flush_lag; +extern int max_replication_write_lag; /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); From 27f713e98f22644c6380f79a28e8d11fa5ac9160 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 10 Jan 2022 16:39:05 +0300 Subject: [PATCH 093/167] Do not throttle wal sender --- src/backend/tcop/postgres.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 4675f16c9fa..e5f760e2eb2 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3376,8 +3376,8 @@ ProcessInterrupts(void) if (InterruptHoldoffCount != 0 || CritSectionCount != 0) return; - // Don't throttle read only transactions - if (!TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + // Don't throttle read only transactions and wal sender + if (am_walsender && !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) { ProcessInterrupts_pg(); return; From 
8f9af1880e319d161298e40b9941a28688d79218 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 11 Jan 2022 09:59:04 +0300 Subject: [PATCH 094/167] Do no throttle wal sender --- src/backend/tcop/postgres.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index e5f760e2eb2..cc97dabac8d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3377,7 +3377,7 @@ ProcessInterrupts(void) return; // Don't throttle read only transactions and wal sender - if (am_walsender && !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) + if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny())) { ProcessInterrupts_pg(); return; From c2e84048444b009c26f372d037d14da3afcc7bda Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 13 Jan 2022 20:26:07 +0300 Subject: [PATCH 095/167] Don't track acks in walproposer (#119) --- src/backend/replication/walproposer.c | 83 ++++++--------------------- src/include/replication/walproposer.h | 2 - 2 files changed, 19 insertions(+), 66 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 6f89c23eb2f..c1008440a76 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -133,7 +133,7 @@ static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr start static void SendProposerElected(Safekeeper *sk); static void WalProposerStartStreaming(XLogRecPtr startpos); static void StartStreaming(Safekeeper *sk); -static void SendMessageToNode(int i, WalMessage *msg); +static void SendMessageToNode(Safekeeper *sk, WalMessage *msg); static void BroadcastMessage(WalMessage *msg); static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); @@ -379,7 +379,6 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) 
initStringInfo(&safekeeper[n_safekeepers].outbuf); safekeeper[n_safekeepers].flushWrite = false; safekeeper[n_safekeepers].currMsg = NULL; - safekeeper[n_safekeepers].ackMsg = NULL; safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; n_safekeepers += 1; } @@ -513,7 +512,6 @@ ShutdownConnection(Safekeeper *sk) sk->state = SS_OFFLINE; sk->flushWrite = false; sk->currMsg = NULL; - sk->ackMsg = NULL; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); @@ -1394,7 +1392,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void StartStreaming(Safekeeper *sk) { - int wki = sk - safekeeper; + WalMessage *startMsg = msgQueueHead; /* * This is the only entrypoint to state SS_ACTIVE. It's executed @@ -1402,23 +1400,14 @@ StartStreaming(Safekeeper *sk) */ sk->state = SS_ACTIVE; - for (WalMessage *msg = msgQueueHead; msg != NULL; msg = msg->next) - { - if (msg->req.endLsn <= sk->startStreamingAt) - { - /* message is already received by this safekeeper */ - msg->ackMask |= 1 << wki; - } - else - { - /* event set will be updated inside SendMessageToNode */ - SendMessageToNode(wki, msg); - return; - } - } + while (startMsg != NULL && startMsg->req.endLsn <= sk->startStreamingAt) + startMsg = startMsg->next; + + /* We should always have WAL to start from sk->startStreamingAt */ + Assert(startMsg == NULL || startMsg->req.beginLsn <= sk->startStreamingAt); - /* Call SS_ACTIVE handler to update event set */ - HandleActiveState(sk, WL_NO_EVENTS); + /* event set will be updated inside SendMessageToNode */ + SendMessageToNode(sk, startMsg); } /* @@ -1428,21 +1417,12 @@ StartStreaming(Safekeeper *sk) * in case of errors. */ static void -SendMessageToNode(int i, WalMessage *msg) +SendMessageToNode(Safekeeper *sk, WalMessage *msg) { - Safekeeper *sk = &safekeeper[i]; - /* we shouldn't be already sending something */ Assert(sk->currMsg == NULL); Assert(sk->state == SS_ACTIVE); - /* - * Skip already acknowledged messages. 
Used after reconnection to get to - * the first not yet sent message. Otherwise we always just send 'msg'. - */ - while (msg != NULL && (msg->ackMask & (1 << i)) != 0) - msg = msg->next; - sk->currMsg = msg; /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ @@ -1459,7 +1439,7 @@ BroadcastMessage(WalMessage *msg) { if (safekeeper[i].state == SS_ACTIVE && safekeeper[i].currMsg == NULL) { - SendMessageToNode(i, msg); + SendMessageToNode(&safekeeper[i], msg); } } } @@ -1488,7 +1468,6 @@ CreateMessage(XLogRecPtr startpos, char *data, int len) msg->size = sizeof(AppendRequestHeader) + len; msg->next = NULL; - msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -1521,7 +1500,6 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) msg->size = sizeof(AppendRequestHeader); msg->next = NULL; - msg->ackMask = 0; msg->req.tag = 'a'; msg->req.term = propTerm; msg->req.epochStartLsn = propEpochStartLsn; @@ -1541,7 +1519,7 @@ CreateMessageCommitLsnOnly(XLogRecPtr lsn) /* * truncateLsn and commitLsn are set just before the message sent, in - * SendMessageToNode() + * SendAppendRequests() */ return msg; } @@ -1587,7 +1565,6 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { - int wki = sk - safekeeper; WalMessage *msg; AppendRequestHeader *req; PGAsyncWriteResult writeResult; @@ -1613,8 +1590,6 @@ SendAppendRequests(Safekeeper *sk) req->commitLsn = GetAcknowledgedByQuorumWALPosition(); req->truncateLsn = truncateLsn; - Assert((msg->ackMask & (1 << wki)) == 0); - /* * If we need to send this message not from the beginning, * form the cut version. 
Only happens for the first @@ -1646,10 +1621,6 @@ SendAppendRequests(Safekeeper *sk) LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port); - /* if this is the first sent message, we should start processing feedback */ - if (sk->ackMsg == NULL) - sk->ackMsg = sk->currMsg; - /* * We write with msg->size here because the body of the * message is stored after the end of the WalMessage @@ -1705,7 +1676,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { XLogRecPtr minQuorumLsn; - int wki = sk - safekeeper; bool readAnything = false; while (true) @@ -1719,20 +1689,6 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - Assert(sk->ackMsg != NULL && (sk->ackMsg->ackMask & (1 << wki)) == 0); - - /* - * We shouldn't read responses ahead of sk->currMsg, because that will - * look like we are receiving responses for messages that haven't been - * sent yet. - */ - Assert(sk->ackMsg != sk->currMsg); - - sk->ackMsg->ackMask |= 1 << wki; /* this safekeeper confirms - * receiving of this - * message */ - - sk->ackMsg = sk->ackMsg->next; readAnything = true; } @@ -1908,8 +1864,11 @@ HandleSafekeeperResponse(void) if (minFlushLsn > truncateLsn) truncateLsn = minFlushLsn; - /* Cleanup message queue up to truncateLsn, but only messages received by everyone */ - while (msgQueueHead != NULL && msgQueueHead->ackMask == ((1 << n_safekeepers) - 1) && msgQueueHead->req.endLsn <= truncateLsn) + /* + * Cleanup message queue up to truncateLsn. These messages were processed + * by all safekeepers because they all reported flushLsn greater than endLsn. 
+ */ + while (msgQueueHead != NULL && msgQueueHead->req.endLsn < truncateLsn) { WalMessage *msg = msgQueueHead; msgQueueHead = msg->next; @@ -1919,13 +1878,9 @@ HandleSafekeeperResponse(void) } if (!msgQueueHead) /* queue is empty */ msgQueueTail = NULL; + /* truncateLsn always points to the first chunk in the queue */ - if (msgQueueHead) - { - /* Max takes care of special 0-sized messages */ - Assert(truncateLsn >= msgQueueHead->req.beginLsn && - truncateLsn < Max(msgQueueHead->req.endLsn, msgQueueHead->req.beginLsn + 1)); - } + Assert(msgQueueHead == NULL || (truncateLsn >= msgQueueHead->req.beginLsn && truncateLsn <= msgQueueHead->req.endLsn)); /* * Generally sync is done when majority switched the epoch so we committed diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 51308cbe5a4..53f1a6de2fe 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -259,7 +259,6 @@ struct WalMessage { WalMessage* next; /* L1 list of messages */ uint32 size; /* message size */ - uint32 ackMask; /* mask of receivers acknowledged receiving of this message */ AppendRequestHeader req; /* request to safekeeper (message header) */ /* PHANTOM FIELD: @@ -321,7 +320,6 @@ typedef struct Safekeeper bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ WalMessage* currMsg; /* message that wasn't sent yet or NULL, if we have nothing to send */ - WalMessage* ackMsg; /* message waiting ack from the receiver */ int eventPos; /* position in wait event set. Equal to -1 if no event */ SafekeeperState state; /* safekeeper state machine state */ From c68dc1fc7dbbf7b2b3f85a40f486652aaca68ebe Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 14 Jan 2022 20:33:19 +0200 Subject: [PATCH 096/167] Silence excessively noisy logging from walproposer. 
In the passing, switch a few places to ereport() instead of elog(), to avoid the overhead of constructing the string when it's not logged. Fixes https://github.com/zenithdb/zenith/issues/1066 --- src/backend/replication/walproposer.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index c1008440a76..2b25eb9b61f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1250,14 +1250,16 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec rec_start_lsn = pg_ntoh64(rec_start_lsn); rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; (void) CreateMessage(rec_start_lsn, buf, len); - elog(DEBUG1, "Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len); + ereport(DEBUG1, + (errmsg("Recover message %X/%X length %d", + LSN_FORMAT_ARGS(rec_start_lsn), len))); if (rec_end_lsn >= endpos) break; } } - elog(DEBUG1, "end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)); + ereport(DEBUG1, + (errmsg("end of replication stream at %X/%X: %m", + LSN_FORMAT_ARGS(rec_end_lsn)))); walrcv_disconnect(wrconn); } else @@ -1613,13 +1615,13 @@ SendAppendRequests(Safekeeper *sk) len); } - elog(LOG, - "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port); + ereport(DEBUG2, + (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); /* * We write with msg->size here because the body of the From 
8916e66b4d5e8854e2112c3b190a95429bcdc17c Mon Sep 17 00:00:00 2001 From: anastasia Date: Mon, 20 Dec 2021 22:20:00 +0300 Subject: [PATCH 097/167] Extend replication protocol with ZenithFeedback message. Add extensible ZenithFeedback part to AppendResponse messages Pass values sizes together with keys in ZenithFeedback message. Add standby_status_update fields into ZenithFeedback. Get rid of diskConsistentLsn field in AppendResponse, because now it is send via ZenithFeedback. Fix calculation of diskConsistentLsn and instanceSize - take values from latest reply from pageserver --- src/backend/replication/walproposer.c | 132 +++++++++++++++++++++----- src/backend/replication/walsender.c | 30 ++++++ src/include/replication/walproposer.h | 24 ++++- 3 files changed, 160 insertions(+), 26 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 2b25eb9b61f..a3d12f2eab0 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -141,7 +141,6 @@ static void HandleActiveState(Safekeeper *sk, uint32 events); static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static void CombineHotStanbyFeedbacks(HotStandbyFeedback * hs); -static XLogRecPtr CalculateDiskConsistentLsn(void); static XLogRecPtr CalculateMinFlushLsn(void); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(void); static void HandleSafekeeperResponse(void); @@ -672,7 +671,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_OFFLINE: elog(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -1715,6 +1714,72 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } +void +ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) +{ + uint8 nkeys; + int i; + int32 len; + + /* get number of custom keys */ + 
nkeys = pq_getmsgbyte(reply_message); + + for (i = 0; i < nkeys; i++) + { + const char *key = pq_getmsgstring(reply_message); + if (strcmp(key, "current_timeline_size") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->currentInstanceSize = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: current_timeline_size %lu", + zf->currentInstanceSize); + } + else if (strcmp(key, "ps_writelsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_writelsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_writelsn %X/%X", + LSN_FORMAT_ARGS(zf->ps_writelsn)); + } + else if (strcmp(key, "ps_flushlsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_flushlsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_flushlsn %X/%X", + LSN_FORMAT_ARGS(zf->ps_flushlsn)); + } + else if (strcmp(key, "ps_applylsn") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_applylsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_applylsn %X/%X", + LSN_FORMAT_ARGS(zf->ps_applylsn)); + } + else if (strcmp(key, "ps_replytime") == 0) + { + pq_getmsgint(reply_message, sizeof(int32)); // read value length + zf->ps_replytime = pq_getmsgint64(reply_message); + { + char *replyTimeStr; + + /* Copy because timestamptz_to_str returns a static buffer */ + replyTimeStr = pstrdup(timestamptz_to_str(zf->ps_replytime)); + elog(DEBUG2, "ParseZenithFeedbackMessage: ps_replytime %lu reply_time: %s", + zf->ps_replytime, replyTimeStr); + + pfree(replyTimeStr); + } + } + else + { + len = pq_getmsgint(reply_message, sizeof(int32)); // read value length + // Skip unknown keys to support backward compatibile protocol changes + elog(LOG, "ParseZenithFeedbackMessage: unknown key: %s len %d", key, len); + pq_getmsgbytes(reply_message, len); + }; + } +} + /* * Combine hot 
standby feedbacks from all safekeepers. */ @@ -1743,22 +1808,6 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback * hs) } } -/* - * Get minimum of disk consistent LSNs of all safekeepers - */ -static XLogRecPtr -CalculateDiskConsistentLsn(void) -{ - XLogRecPtr lsn = UnknownXLogRecPtr; - for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].appendResponse.diskConsistentLsn < lsn) - { - lsn = safekeeper[i].appendResponse.diskConsistentLsn; - } - } - return lsn; -} /* * Get minimum of flushed LSNs of all safekeepers, which is the LSN of the @@ -1804,6 +1853,31 @@ GetAcknowledgedByQuorumWALPosition(void) return responses[n_safekeepers - quorum]; } +/* + * Get ZenithFeedback fields from the most advanced safekeeper + */ +static void +GetLatestZentihFeedback(ZenithFeedback *zf) +{ + int latest_safekeeper = 0; + uint64 replyTime = 0; + for (int i = 0; i < n_safekeepers; i++) + { + if (safekeeper[i].appendResponse.zf.ps_replytime > replyTime) + { + latest_safekeeper = i; + replyTime = safekeeper[i].appendResponse.zf.ps_replytime; + elog(LOG, "safekeeper[%d] replyTime %lu", i, replyTime); + } + } + + zf->currentInstanceSize = safekeeper[latest_safekeeper].appendResponse.zf.currentInstanceSize; + zf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_writelsn; + zf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_flushlsn; + zf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_applylsn; + zf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.zf.ps_replytime; +} + static void HandleSafekeeperResponse(void) { @@ -1812,17 +1886,18 @@ HandleSafekeeperResponse(void) XLogRecPtr diskConsistentLsn; XLogRecPtr minFlushLsn; + minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = CalculateDiskConsistentLsn(); + diskConsistentLsn = quorumFeedback.zf.ps_flushlsn; + // Get ZenithFeedback fields from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.zf); - if (minQuorumLsn > 
quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.diskConsistentLsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.zf.ps_flushlsn) { if (minQuorumLsn > quorumFeedback.flushLsn) quorumFeedback.flushLsn = minQuorumLsn; - quorumFeedback.diskConsistentLsn = diskConsistentLsn; - /* advance the replication slot */ if (!syncSafekeepers) ProcessStandbyReply( @@ -1831,7 +1906,7 @@ HandleSafekeeperResponse(void) //flush_lsn - This is what durably stored in WAL service. quorumFeedback.flushLsn, //apply_lsn - This is what processed and durably saved at pageserver. - quorumFeedback.diskConsistentLsn, + quorumFeedback.zf.ps_flushlsn, GetCurrentTimestamp(), false); } @@ -2017,10 +2092,19 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->term = pq_getmsgint64_le(&s); msg->flushLsn = pq_getmsgint64_le(&s); msg->commitLsn = pq_getmsgint64_le(&s); - msg->diskConsistentLsn = pq_getmsgint64_le(&s); msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); + if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) + { + StringInfoData z; + z.data = buf + APPENDRESPONSE_FIXEDPART_SIZE; + z.len = buf_size - APPENDRESPONSE_FIXEDPART_SIZE; + z.cursor = 0; + ParseZenithFeedbackMessage(&s, &msg->zf); + //advance main StringInfo cursor, because it is checked in pq_getmsgend below + s.cursor += z.cursor; + } pq_getmsgend(&s); return true; } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 507318e113c..f914d51deb3 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -239,6 +239,7 @@ void StartReplication(StartReplicationCmd *cmd); static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); +static void ProcessZenithFeedbackMessage(void); static void ProcessStandbyHSFeedbackMessage(void); 
static void ProcessRepliesIfAny(void); static void WalSndKeepalive(bool requestReply); @@ -1848,6 +1849,10 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; + case 'z': + ProcessZenithFeedbackMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -1911,6 +1916,31 @@ ProcessStandbyReplyMessage(void) applyPtr, replyTime, replyRequested); + + elog(LOG, "ProcessStandbyReplyMessage: writelsn %X/%X", + LSN_FORMAT_ARGS(writePtr)); + elog(LOG, "ProcessStandbyReplyMessage: flushlsn %X/%X", + LSN_FORMAT_ARGS(flushPtr)); + elog(LOG, "ProcessStandbyReplyMessage: applylsn %X/%X", + LSN_FORMAT_ARGS(applyPtr)); +} + +// This message is a zenith extension of postgres replication protocol +static void +ProcessZenithFeedbackMessage(void) +{ + ZenithFeedback zf; + + // consume message length + pq_getmsgint64(&reply_message); + + ParseZenithFeedbackMessage(&reply_message, &zf); + + ProcessStandbyReply(zf.ps_writelsn, + zf.ps_flushlsn, + zf.ps_applylsn, + zf.ps_replytime, + false); } void diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 53f1a6de2fe..9a4dd028e69 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -278,6 +278,18 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; + +typedef struct ZenithFeedback +{ + // current size of the timeline on pageserver + uint64 currentInstanceSize; + // standby_status_update fields that safekeeper received from pageserver + XLogRecPtr ps_writelsn; + XLogRecPtr ps_flushlsn; + XLogRecPtr ps_applylsn; + TimestampTz ps_replytime; +} ZenithFeedback; + /* * Report safekeeper state to proposer */ @@ -294,11 +306,17 @@ typedef struct AppendResponse // Safekeeper reports back his awareness about which WAL is committed, as // this is a criterion for walproposer --sync mode exit XLogRecPtr commitLsn; - // Part of WAL applied and written to the disk by all pageservers 
- XLogRecPtr diskConsistentLsn; HotStandbyFeedback hs; + // Feedback recieved from pageserver includes standby_status_update fields + // and custom zenith feedback. + // This part of the message is extensible. + ZenithFeedback zf; } AppendResponse; +// ZenithFeedback is extensible part of the message that is parsed separately +// Other fields are fixed part +#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, zf) + /* * Descriptor of safekeeper @@ -357,6 +375,8 @@ void ProcessStandbyHSFeedback(TimestampTz replyTime, uint32 feedbackEpoch, TransactionId feedbackCatalogXmin, uint32 feedbackCatalogEpoch); +void ParseZenithFeedbackMessage(StringInfo reply_message, + ZenithFeedback *zf); void StartReplication(StartReplicationCmd *cmd); void WalProposerSync(int argc, char *argv[]); From e7322f1e111d97cd1694e09feea46c077082ef0f Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 27 Jan 2022 17:23:54 +0300 Subject: [PATCH 098/167] Allow to join empty safekeeper to existing cluster (#123) --- src/backend/replication/walproposer.c | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index a3d12f2eab0..1ae567f90c6 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1319,6 +1319,34 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = propTermHistory.entries[0].lsn; + + if (sk->startStreamingAt < truncateLsn) + { + /* + * There's a gap between the WAL starting point and a truncateLsn, + * which can't appear in a normal working cluster. That gap means + * that all safekeepers reported that they have persisted WAL up + * to the truncateLsn before, but now current safekeeper tells + * otherwise. + * + * Also we have a special condition here, which is empty safekeeper + * with no history. 
In combination with a gap, that can happen when + * we introduce a new safekeeper to the cluster. This is a rare case, + * which is triggered manually for now, and should be treated with + * care. + */ + + /* + * truncateLsn will not change without ack from current safekeeper, + * and it's aligned to the WAL record, so we can safely start + * streaming from this point. + */ + sk->startStreamingAt = truncateLsn; + + elog(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", + sk->host, sk->port, LSN_FORMAT_ARGS(propTermHistory.entries[0].lsn), + LSN_FORMAT_ARGS(sk->startStreamingAt)); + } } else { From 0df307177f5c723bf84329674c39c79a0b43e1dc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 20:23:51 +0300 Subject: [PATCH 099/167] Use local relation cache for smgr_exists refer #1077 --- contrib/zenith/pagestore_smgr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 81aa2339779..d2fabd8de66 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -616,6 +616,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; ZenithResponse *resp; + BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -642,6 +643,11 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -746,6 +752,9 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * exist. 
*/ mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) { + forget_cached_relsize(rnode.node, forkNum); + } } /* From a5db530c28b1f7c17119a1be392ea473c8f92204 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 5 Feb 2022 01:25:47 +0300 Subject: [PATCH 100/167] Reduce walproposer logging after ca5e7beaf. --- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 1ae567f90c6..ff281c4e2ed 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1895,7 +1895,7 @@ GetLatestZentihFeedback(ZenithFeedback *zf) { latest_safekeeper = i; replyTime = safekeeper[i].appendResponse.zf.ps_replytime; - elog(LOG, "safekeeper[%d] replyTime %lu", i, replyTime); + elog(DEBUG2, "safekeeper[%d] replyTime %lu", i, replyTime); } } From c0845f9773e407c10ec436a5ef4145fea8ad25cb Mon Sep 17 00:00:00 2001 From: anastasia Date: Mon, 20 Dec 2021 23:08:27 +0300 Subject: [PATCH 101/167] Implement cluster size quota for zenith compute node. Use GUC zenith.max_cluster_size to set the limit. If limit is reached, extend requests will throw out-of-space error. When current size is too close to the limit - throw a warning. 
Do not apply size quota to autovacuum process Add pg_cluster_size() funciton in zenith extension --- contrib/zenith/Makefile | 3 ++- contrib/zenith/libpagestore.c | 9 +++++++ contrib/zenith/pagestore_client.h | 1 + contrib/zenith/pagestore_smgr.c | 32 ++++++++++++++++++++++++ contrib/zenith/zenith--1.0.sql | 7 ++++++ contrib/zenith/zenith.c | 33 +++++++++++++++++++++++++ contrib/zenith/zenith_functions.c | 35 +++++++++++++++++++++++++++ src/backend/access/transam/xlog.c | 25 +++++++++++++++++++ src/backend/replication/walproposer.c | 12 ++++++--- src/backend/replication/walsender.c | 2 ++ src/include/access/xlog.h | 3 +++ src/include/replication/walproposer.h | 2 +- 12 files changed, 158 insertions(+), 6 deletions(-) create mode 100644 contrib/zenith/zenith--1.0.sql create mode 100644 contrib/zenith/zenith.c create mode 100644 contrib/zenith/zenith_functions.c diff --git a/contrib/zenith/Makefile b/contrib/zenith/Makefile index 4b706186fff..a4a60d7b88c 100644 --- a/contrib/zenith/Makefile +++ b/contrib/zenith/Makefile @@ -4,12 +4,13 @@ MODULE_big = zenith OBJS = \ $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o + inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o zenith.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) EXTENSION = zenith +DATA = zenith--1.0.sql PGFILEDESC = "zenith - cloud storage for PostgreSQL" ifdef USE_PGXS diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 2caf5d74b6e..9e16fa2c6fd 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -335,6 +335,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable("zenith.max_cluster_size", + "cluster size limit", + NULL, + &max_cluster_size, + -1, -1, MAX_KILOBYTES, + PGC_SIGHUP, + GUC_UNIT_BYTE, + NULL, NULL, NULL); + relsize_hash_init(); if (page_server != NULL) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index 3643971f254..c040c4b816b 100644 
--- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -133,6 +133,7 @@ extern char *callmemaybe_connstring; extern char *zenith_timeline; extern char *zenith_tenant; extern bool wal_redo; +extern int32 max_cluster_size; extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode); extern void smgr_init_zenith(void); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index d2fabd8de66..814e26b91fd 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -62,6 +62,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "catalog/pg_tablespace_d.h" +#include "postmaster/autovacuum.h" /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API @@ -91,6 +92,7 @@ char *callmemaybe_connstring; char *zenith_timeline; char *zenith_tenant; bool wal_redo = false; +int32 max_cluster_size; /* unlogged relation build states */ typedef enum @@ -771,6 +773,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { XLogRecPtr lsn; + uint64 current_instance_size; switch (reln->smgr_relpersistence) { @@ -789,6 +792,35 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + current_instance_size = GetZenithCurrentClusterSize(); + + // Do not limit autovacuum processes. + if (!IsAutoVacuumWorkerProcess() && max_cluster_size > 0) + { + if (current_instance_size >= max_cluster_size) + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file. Cluster size limit of %d bytes is reached", + max_cluster_size), + errhint("This limit is defined by zenith.max_cluster_size GUC"))); + // Throw a warning if current size is too close to the limit. 
+ // `too close' is now defined as 10% + else if (current_instance_size >= max_cluster_size*0.1) + { + ereport(WARNING, + (errmsg("Current cluster size %lu bytes is close to the limit of %d bytes. ", + current_instance_size, max_cluster_size), + errhint("This limit is defined by zenith.max_cluster_size GUC"))); + } + else + { + ereport(WARNING, + (errmsg("Current cluster size %lu bytes is not close to the limit of %d bytes. ", + current_instance_size, max_cluster_size), + errhint("This limit is defined by zenith.max_cluster_size GUC"))); + } + } + zenith_wallog_page(reln, forkNum, blkno, buffer); set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1); diff --git a/contrib/zenith/zenith--1.0.sql b/contrib/zenith/zenith--1.0.sql new file mode 100644 index 00000000000..095104c1045 --- /dev/null +++ b/contrib/zenith/zenith--1.0.sql @@ -0,0 +1,7 @@ +\echo Use "CREATE EXTENSION zenith" to load this file. \quit + +CREATE FUNCTION pg_cluster_size() +RETURNS bigint +AS 'MODULE_PATHNAME', 'pg_cluster_size' +LANGUAGE C STRICT +PARALLEL UNSAFE; \ No newline at end of file diff --git a/contrib/zenith/zenith.c b/contrib/zenith/zenith.c new file mode 100644 index 00000000000..3f2a6cee924 --- /dev/null +++ b/contrib/zenith/zenith.c @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * zenith.c + * Utility functions to expose zenith specific information to user + * + * IDENTIFICATION + * contrib/zenith/zenith.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + + +PG_FUNCTION_INFO_V1(pg_cluster_size); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} \ No newline at end of file diff --git 
a/contrib/zenith/zenith_functions.c b/contrib/zenith/zenith_functions.c new file mode 100644 index 00000000000..3e2b137d205 --- /dev/null +++ b/contrib/zenith/zenith_functions.c @@ -0,0 +1,35 @@ +/*------------------------------------------------------------------------- + * + * zenith.c + * Utility functions to expose zenith specific information to user + * + * IDENTIFICATION + * contrib/zenith/zenith.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "fmgr.h" + +#include "access/xact.h" +#include "access/clog.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(pg_cluster_size); + +Datum +pg_cluster_size(PG_FUNCTION_ARGS) +{ + int64 size; + + size = GetZenithCurrentClusterSize(); + + if (size == 0) + PG_RETURN_NULL(); + + PG_RETURN_INT64(size); +} \ No newline at end of file diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7c9bdce8dcc..a475b2ad622 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -749,6 +749,11 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; XLogRecPtr lastWrittenPageLSN; + /* + * size of a timeline in zenith pageserver. + * used to enforce timeline size limit. 
+ */ + uint64 zenithCurrentClusterSize; slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; @@ -8820,6 +8825,26 @@ SetLastWrittenPageLSN(XLogRecPtr lsn) } +uint64 +GetZenithCurrentClusterSize(void) +{ + uint64 size; + SpinLockAcquire(&XLogCtl->info_lck); + size = XLogCtl->zenithCurrentClusterSize; + SpinLockRelease(&XLogCtl->info_lck); + + return size; +} + + +void +SetZenithCurrentClusterSize(uint64 size) +{ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->zenithCurrentClusterSize = size; + SpinLockRelease(&XLogCtl->info_lck); +} + /* diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index ff281c4e2ed..4e10cce8c8b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1758,9 +1758,9 @@ ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) if (strcmp(key, "current_timeline_size") == 0) { pq_getmsgint(reply_message, sizeof(int32)); // read value length - zf->currentInstanceSize = pq_getmsgint64(reply_message); + zf->currentClusterSize = pq_getmsgint64(reply_message); elog(DEBUG2, "ParseZenithFeedbackMessage: current_timeline_size %lu", - zf->currentInstanceSize); + zf->currentClusterSize); } else if (strcmp(key, "ps_writelsn") == 0) { @@ -1895,11 +1895,10 @@ GetLatestZentihFeedback(ZenithFeedback *zf) { latest_safekeeper = i; replyTime = safekeeper[i].appendResponse.zf.ps_replytime; - elog(DEBUG2, "safekeeper[%d] replyTime %lu", i, replyTime); } } - zf->currentInstanceSize = safekeeper[latest_safekeeper].appendResponse.zf.currentInstanceSize; + zf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.zf.currentClusterSize; zf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_writelsn; zf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_flushlsn; zf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_applylsn; @@ -1920,6 +1919,11 @@ HandleSafekeeperResponse(void) // Get 
ZenithFeedback fields from the most advanced safekeeper GetLatestZentihFeedback(&quorumFeedback.zf); + if (!syncSafekeepers) + { + SetZenithCurrentClusterSize(quorumFeedback.zf.currentClusterSize); + } + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.zf.ps_flushlsn) { diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index f914d51deb3..3b1c25416a7 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1936,6 +1936,8 @@ ProcessZenithFeedbackMessage(void) ParseZenithFeedbackMessage(&reply_message, &zf); + SetZenithCurrentClusterSize(zf.currentClusterSize); + ProcessStandbyReply(zf.ps_writelsn, zf.ps_flushlsn, zf.ps_applylsn, diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 986eb957570..e34f1deaf6e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -353,6 +353,9 @@ extern void RemovePromoteSignalFiles(void); extern void SetLastWrittenPageLSN(XLogRecPtr lsn); extern XLogRecPtr GetLastWrittenPageLSN(void); +extern void SetZenithCurrentClusterSize(uint64 size); +extern uint64 GetZenithCurrentClusterSize(void); + extern bool PromoteIsTriggered(void); extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 9a4dd028e69..9bd5d8d1508 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -282,7 +282,7 @@ typedef struct HotStandbyFeedback typedef struct ZenithFeedback { // current size of the timeline on pageserver - uint64 currentInstanceSize; + uint64 currentClusterSize; // standby_status_update fields that safekeeper received from pageserver XLogRecPtr ps_writelsn; XLogRecPtr ps_flushlsn; From 14aaa50728328e19d496d05ecb067bee9535bf30 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 9 Feb 2022 14:31:26 +0300 Subject: [PATCH 102/167] Revert "Use local relation 
cache for smgr_exists" This reverts commit 45dd8911ec13fd47882685f6d81d7b73696e1b84. It introduced stable test_isolation failure. There was an idea that adding strict backpressure settings would help, as absense of this commit could behave as natural backpressure, but that didn't help. No better fix is immediately available, so let's revert until sorting this out. ref https://github.com/zenithdb/zenith/issues/1238 ref https://github.com/zenithdb/zenith/pull/1239 --- contrib/zenith/pagestore_smgr.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 814e26b91fd..17f0b3b1da4 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -618,7 +618,6 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; ZenithResponse *resp; - BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -645,11 +644,6 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) - { - return true; - } - request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -754,9 +748,6 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * exist. */ mdunlink(rnode, forkNum, isRedo); - if (!RelFileNodeBackendIsTemp(rnode)) { - forget_cached_relsize(rnode.node, forkNum); - } } /* From 952a94e61ca8518bb89548d65e30092aa3e1fc37 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Feb 2022 15:26:41 +0200 Subject: [PATCH 103/167] Change the unit of cluster size limit GUC to MB, and other fixes. The GUC is a 32-bit integer, so if the base unit is bytes, the max limit you can set is only 2 GB. Furthermore, the web console assumed that the unit is in MB, and set it to 10000 meaning 10 GB, but in reality it was set to just 10 kB. Remove the WARNINGs related to cluster size limit. 
That was probably supposed to be DEBUG5 or something, because it's extremely noisy currently. You get the WARNING for *every block* when a relation is extended. Some kind of a WARNING when you approach the limit would make sense, but it's difficult to do in a sensible way with WARNINGs from the server. Firstly, most applications will ignore WARNINGs, in which case they don't accomplish anything. If an application forwards them to the user, that's not great either unless the application user happens to be the DBA. If you're lucky, the WARNINGs end up in an application log and the DBA is alerted, but printing the message for every relation extension is too noisy for that too. An email alert would probably be best, outside Postgres. Also don't enforce the limit when extending a temporary or unlogged relation. They don't count towards the cluster size limit, so it seems weird to error out on them. And reword the error message a bit. Fixes https://github.com/zenithdb/zenith/issues/1233 --- contrib/zenith/libpagestore.c | 4 ++-- contrib/zenith/pagestore_smgr.c | 38 ++++++++++++--------------------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index 9e16fa2c6fd..e26028dc6e7 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -339,9 +339,9 @@ _PG_init(void) "cluster size limit", NULL, &max_cluster_size, - -1, -1, MAX_KILOBYTES, + -1, -1, INT_MAX, PGC_SIGHUP, - GUC_UNIT_BYTE, + GUC_UNIT_MB, NULL, NULL, NULL); relsize_hash_init(); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 17f0b3b1da4..c4dcff31fc3 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -764,7 +764,6 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer, bool skipFsync) { XLogRecPtr lsn; - uint64 current_instance_size; switch (reln->smgr_relpersistence) { @@ -783,33 +782,24 @@ 
zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - current_instance_size = GetZenithCurrentClusterSize(); - - // Do not limit autovacuum processes. - if (!IsAutoVacuumWorkerProcess() && max_cluster_size > 0) + /* + * Check that the cluster size limit has not been exceeded. + * + * Temporary and unlogged relations are not included in the cluster size measured + * by the page server, so ignore those. Autovacuum processes are also exempt. + */ + if (max_cluster_size > 0 && + reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && + !IsAutoVacuumWorkerProcess()) { - if (current_instance_size >= max_cluster_size) + uint64 current_size = GetZenithCurrentClusterSize(); + + if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file. Cluster size limit of %d bytes is reached", - max_cluster_size), - errhint("This limit is defined by zenith.max_cluster_size GUC"))); - // Throw a warning if current size is too close to the limit. - // `too close' is now defined as 10% - else if (current_instance_size >= max_cluster_size*0.1) - { - ereport(WARNING, - (errmsg("Current cluster size %lu bytes is close to the limit of %d bytes. ", - current_instance_size, max_cluster_size), + errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + max_cluster_size), errhint("This limit is defined by zenith.max_cluster_size GUC"))); - } - else - { - ereport(WARNING, - (errmsg("Current cluster size %lu bytes is not close to the limit of %d bytes. 
", - current_instance_size, max_cluster_size), - errhint("This limit is defined by zenith.max_cluster_size GUC"))); - } } zenith_wallog_page(reln, forkNum, blkno, buffer); From d881913708576e143297437bf32ce1d5fcdd026f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Feb 2022 16:11:43 +0200 Subject: [PATCH 104/167] Improve error handling while connecting to page server. If anything goes wrong while establishing a connection, don't leak the socket. Also, if you get an error while sending the GetPage request, kill the connection. It's not clear what state it's in, so better to reconnect. --- contrib/zenith/libpagestore.c | 131 +++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 43 deletions(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index e26028dc6e7..d5e48fc89b3 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -40,7 +40,7 @@ void _PG_init(void); errhidestmt(true), errhidecontext(true))) bool connected = false; -PGconn *pageserver_conn; +PGconn *pageserver_conn = NULL; char *page_server_connstring_raw; @@ -55,6 +55,8 @@ zenith_connect() char *query; int ret; + Assert(!connected); + pageserver_conn = PQconnectdb(page_server_connstring); if (PQstatus(pageserver_conn) == CONNECTION_BAD) @@ -62,6 +64,7 @@ zenith_connect() char *msg = pchomp(PQerrorMessage(pageserver_conn)); PQfinish(pageserver_conn); + pageserver_conn = NULL; ereport(ERROR, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg("[ZENITH_SMGR] could not establish connection"), @@ -79,6 +82,8 @@ zenith_connect() res = PQexec(pageserver_conn, query); if (PQresultStatus(res) != PGRES_COMMAND_OK) { + PQfinish(pageserver_conn); + pageserver_conn = NULL; zenith_log(ERROR, "[ZENITH_SMGR] callmemaybe command failed"); } @@ -88,8 +93,12 @@ zenith_connect() query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline); ret = PQsendQuery(pageserver_conn, query); if (ret != 1) + { + 
PQfinish(pageserver_conn); + pageserver_conn = NULL; zenith_log(ERROR, "[ZENITH_SMGR] failed to start dispatcher_loop on pageserver"); + } while (PQisBusy(pageserver_conn)) { @@ -109,8 +118,15 @@ zenith_connect() if (wc & WL_SOCKET_READABLE) { if (!PQconsumeInput(pageserver_conn)) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + PQfinish(pageserver_conn); + pageserver_conn = NULL; + zenith_log(ERROR, "[ZENITH_SMGR] failed to get handshake from pageserver: %s", - PQerrorMessage(pageserver_conn)); + msg); + } } } @@ -128,59 +144,88 @@ zenith_call(ZenithRequest *request) StringInfoData resp_buff; ZenithResponse *resp; - /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + PG_TRY(); { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; - } + /* If the connection was lost for some reason, reconnect */ + if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + { + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } - if (!connected) - zenith_connect(); + if (!connected) + zenith_connect(); - req_buff = zm_pack_request(request); + req_buff = zm_pack_request(request); - /* send request */ - if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) - { - zenith_log(ERROR, "failed to send page request: %s", - PQerrorMessage(pageserver_conn)); - } - pfree(req_buff.data); + /* + * Send request. + * + * In principle, this could block if the output buffer is full, and we + * should use async mode and check for interrupts while waiting. In + * practice, our requests are small enough to always fit in the output and + * TCP buffer. 
+ */ + if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn)) + { + zenith_log(ERROR, "failed to send page request: %s", + PQerrorMessage(pageserver_conn)); + } + pfree(req_buff.data); - if (message_level_is_interesting(PqPageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) request); + if (message_level_is_interesting(PqPageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) request); - zenith_log(PqPageStoreTrace, "Sent request: %s", msg); - pfree(msg); - } + zenith_log(PqPageStoreTrace, "Sent request: %s", msg); + pfree(msg); + } - /* read response */ - resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); - resp_buff.cursor = 0; + /* read response */ + resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); + resp_buff.cursor = 0; - if (resp_buff.len == -1) - zenith_log(ERROR, "end of COPY"); - else if (resp_buff.len == -2) - zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); + if (resp_buff.len == -1) + zenith_log(ERROR, "end of COPY"); + else if (resp_buff.len == -2) + zenith_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn)); - resp = zm_unpack_response(&resp_buff); - PQfreemem(resp_buff.data); + resp = zm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); - if (message_level_is_interesting(PqPageStoreTrace)) - { - char *msg = zm_to_string((ZenithMessage *) resp); + if (message_level_is_interesting(PqPageStoreTrace)) + { + char *msg = zm_to_string((ZenithMessage *) resp); - zenith_log(PqPageStoreTrace, "Got response: %s", msg); - pfree(msg); - } + zenith_log(PqPageStoreTrace, "Got response: %s", msg); + pfree(msg); + } - /* - * XXX: zm_to_string leak strings. Check with what memory contex all this - * methods are called. - */ + /* + * XXX: zm_to_string leak strings. Check with what memory contex all this + * methods are called. 
+ */ + } + PG_CATCH(); + { + /* + * If anything goes wrong while we were sending a request, it's not + * clear what state the connection is in. For example, if we sent the + * request but didn't receive a response yet, we might receive the + * response some time later after we have already sent a new unrelated + * request. Close the connection to avoid getting confused. + */ + if (connected) + { + zenith_log(LOG, "dropping connection to page server due to error"); + PQfinish(pageserver_conn); + pageserver_conn = NULL; + connected = false; + } + PG_RE_THROW(); + } + PG_END_TRY(); return (ZenithResponse *) resp; } From 580c034653de752af357976c503168787302074f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Feb 2022 16:11:48 +0200 Subject: [PATCH 105/167] Make getpage requests interruptible. Fixes https://github.com/zenithdb/zenith/issues/1224 --- contrib/zenith/libpagestore.c | 41 ++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index d5e48fc89b3..d3e20fc6411 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -136,6 +136,45 @@ zenith_connect() connected = true; } +/* + * A wrapper around PQgetCopyData that checks for interrupts while sleeping. + */ +static int +call_PQgetCopyData(PGconn *conn, char **buffer) +{ + int ret; + +retry: + ret = PQgetCopyData(conn, buffer, 1 /* async */); + + if (ret == 0) + { + int wc; + + /* Sleep until there's something to do */ + wc = WaitLatchOrSocket(MyLatch, + WL_LATCH_SET | WL_SOCKET_READABLE | + WL_EXIT_ON_PM_DEATH, + PQsocket(conn), + -1L, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? 
*/ + if (wc & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(conn)) + zenith_log(ERROR, "could not get response from pageserver: %s", + PQerrorMessage(conn)); + } + + goto retry; + } + + return ret; +} + static ZenithResponse * zenith_call(ZenithRequest *request) @@ -183,7 +222,7 @@ zenith_call(ZenithRequest *request) } /* read response */ - resp_buff.len = PQgetCopyData(pageserver_conn, &resp_buff.data, 0); + resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data); resp_buff.cursor = 0; if (resp_buff.len == -1) From e056fb6b7ea32ea69fe22c4e9f8ab5f9af951977 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Feb 2022 11:27:35 +0200 Subject: [PATCH 106/167] Fix memory leak of messages received from safekeepers. Fixes https://github.com/zenithdb/zenith/issues/822 --- .../libpqwalproposer/libpqwalproposer.c | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index f6714c08128..085f1fcfe57 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -12,6 +12,7 @@ struct WalProposerConn { PGconn* pg_conn; bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received data from libpqprop_async_read */ }; /* Prototypes for exported functions */ @@ -112,6 +113,7 @@ libpqprop_connect_start(char* conninfo) conn = palloc(sizeof(WalProposerConn)); conn->pg_conn = pg_conn; conn->is_nonblocking = false; /* connections always start in blocking mode */ + conn->recvbuf = NULL; return conn; } @@ -247,18 +249,36 @@ libpqprop_flush(WalProposerConn* conn) static void libpqprop_finish(WalProposerConn* conn) { + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); PQfinish(conn->pg_conn); pfree(conn); } +/* + * Receive a message from the safekeeper. 
+ * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) { int result; + if (conn->recvbuf != NULL) + { + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; + } + /* Call PQconsumeInput so that we have the data we need */ if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_FAIL; + } /* The docs for PQgetCopyData list the return values as: * 0 if the copy is still in progress, but no "complete row" is @@ -272,9 +292,11 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ - switch (result = PQgetCopyData(conn->pg_conn, buf, true)) + switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true)) { case 0: + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_TRY_AGAIN; case -1: { @@ -292,13 +314,18 @@ libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount) /* If there was actually an error, it'll be properly reported by * calls to PQerrorMessage -- we don't have to do anything else */ + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_FAIL; } case -2: + *amount = 0; + *buf = NULL; return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ *amount = result; + *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } From 8524105dc7ef2033cf93f5e064eeab1fcc4f1011 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 11 Feb 2022 14:43:26 +0300 Subject: [PATCH 107/167] Initialize pgxactoff for walproposer refer #1244 --- src/backend/replication/walproposer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 4e10cce8c8b..7c7707e2192 100644 --- a/src/backend/replication/walproposer.c +++ 
b/src/backend/replication/walproposer.c @@ -174,6 +174,7 @@ WalProposerMain(Datum main_arg) am_wal_proposer = true; am_walsender = true; InitWalSender(); + InitProcessPhase2(); /* Create replication slot for WAL proposer if not exists */ if (SearchNamedReplicationSlot(WAL_PROPOSER_SLOT_NAME, false) == NULL) From 7b5ee1c08667dab3dbdd7f20da1f33e2ae55b372 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 14 Feb 2022 19:53:07 +0300 Subject: [PATCH 108/167] Fix compilation warning after 8524105dc. --- src/backend/replication/walproposer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7c7707e2192..1ceae1d6dd9 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -50,6 +50,7 @@ #include "postmaster/interrupt.h" #include "postmaster/postmaster.h" #include "storage/pmsignal.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" #include "utils/memutils.h" From d914790e6c2070dec2e35bd3135bb46acb510bf9 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 14 Feb 2022 22:07:18 +0300 Subject: [PATCH 109/167] Fix more compiler warnings. 
--- src/backend/postmaster/seccomp.c | 4 +++- .../replication/libpqwalproposer/libpqwalproposer.c | 8 ++++++++ src/backend/replication/walproposer_utils.c | 6 +++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c index 4ff34ebbd66..03971a072cf 100644 --- a/src/backend/postmaster/seccomp.c +++ b/src/backend/postmaster/seccomp.c @@ -184,8 +184,10 @@ do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_action) static void die(int code, const char *str) { + /* work around gcc ignoring that it shouldn't warn on (void) result being unused */ + ssize_t _unused pg_attribute_unused(); /* Best effort write to stderr */ - (void)write(fileno(stderr), str, strlen(str)); + _unused = write(fileno(stderr), str, strlen(str)); /* XXX: we don't want to run any atexit callbacks */ _exit(code); diff --git a/src/backend/replication/libpqwalproposer/libpqwalproposer.c b/src/backend/replication/libpqwalproposer/libpqwalproposer.c index 085f1fcfe57..a12a2ee04bc 100644 --- a/src/backend/replication/libpqwalproposer/libpqwalproposer.c +++ b/src/backend/replication/libpqwalproposer/libpqwalproposer.c @@ -143,6 +143,10 @@ libpqprop_connect_poll(WalProposerConn* conn) elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* This return is never actually reached, but it's here to make the compiler happy */ return WP_CONN_POLLING_FAILED; + + default: + Assert(false); + return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */ } return return_val; @@ -226,6 +230,10 @@ libpqprop_get_query_result(WalProposerConn* conn) case PGRES_PIPELINE_ABORTED: return_val = WP_EXEC_FAILED; break; + + default: + Assert(false); + return_val = WP_EXEC_FAILED; /* keep the compiler quiet */ } if (unexpected_success) diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 37f8d2075f6..7a593a71778 100644 --- 
a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -111,7 +111,7 @@ AssertEventsOkForState(uint32 events, Safekeeper* sk) uint32 SafekeeperStateDesiredEvents(SafekeeperState state) { - uint32 result; + uint32 result = WL_NO_EVENTS; /* If the state doesn't have a modifier, we can check the base state */ switch (state) @@ -154,6 +154,10 @@ SafekeeperStateDesiredEvents(SafekeeperState state) case SS_OFFLINE: result = WL_NO_EVENTS; break; + + default: + Assert(false); + break; } return result; From b426ffee02659bf86d09d7a6c61660ab3f0933e3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 16 Feb 2022 12:09:50 +0200 Subject: [PATCH 110/167] Remove dead code in handling ZenithFeedback part of an AppendResponse. The constructed StringInfoData 'z' variable wasn't used for anything, we passed the original 's' StringInfo directly to ParseZenithFeedbackMessage. That's fine, but let's remove the dead code. --- src/backend/replication/walproposer.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 1ceae1d6dd9..cf473d29215 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1744,6 +1744,7 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } +/* Parse a ZenithFeedback message, or the ZenithFeedback part of an AppendResponse */ void ParseZenithFeedbackMessage(StringInfo reply_message, ZenithFeedback *zf) { @@ -2130,15 +2131,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - { - StringInfoData z; - z.data = buf + APPENDRESPONSE_FIXEDPART_SIZE; - z.len = buf_size - APPENDRESPONSE_FIXEDPART_SIZE; - z.cursor = 0; ParseZenithFeedbackMessage(&s, &msg->zf); - //advance main StringInfo cursor, 
because it is checked in pq_getmsgend below - s.cursor += z.cursor; - } pq_getmsgend(&s); return true; } From ecbcefb4054db26de2b5781458a2d7987640b558 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 24 Dec 2021 15:32:50 +0300 Subject: [PATCH 111/167] Prevent recursive call of XLogBeginInsert refer #1015 --- src/backend/access/gin/ginfast.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index e0d99409461..2a52ae2650f 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -285,6 +285,15 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) memset(&sublist, 0, sizeof(GinMetaPageData)); makeSublist(index, collector->tuples, collector->ntuples, &sublist); + if (metadata->head != InvalidBlockNumber) + { + /* + * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call + * of XLogBeginInsert() by zenith_wallog_page->log_newpage_copy. 
+ */ + buffer = ReadBuffer(index, metadata->tail); + } + if (needWal) XLogBeginInsert(); @@ -316,7 +325,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) data.prevTail = metadata->tail; data.newRightlink = sublist.head; - buffer = ReadBuffer(index, metadata->tail); LockBuffer(buffer, GIN_EXCLUSIVE); page = BufferGetPage(buffer); From d455371c5893db3d761af17a1fb37ff4827277f1 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 16 Feb 2022 11:44:52 +0300 Subject: [PATCH 112/167] Update src/backend/access/gin/ginfast.c Co-authored-by: Heikki Linnakangas --- src/backend/access/gin/ginfast.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index 2a52ae2650f..2d964c02e95 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -289,7 +289,9 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) { /* * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call - * of XLogBeginInsert() by zenith_wallog_page->log_newpage_copy. + * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from + * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write() + * will try to WAL-log an image of the page. 
*/ buffer = ReadBuffer(index, metadata->tail); } From 2c2b36228e311c0fc213778756cb3e4a689de798 Mon Sep 17 00:00:00 2001 From: Anton Shyrabokau <97127717+antons-antons@users.noreply.github.com> Date: Fri, 18 Feb 2022 08:52:01 -0800 Subject: [PATCH 113/167] Expose reading a relation page at a specific LSN (#131) * Expose reading a relation page at a specific LSN * Addressing comments --- contrib/zenith/pagestore_client.h | 4 + contrib/zenith/pagestore_smgr.c | 69 ++++---- .../zenith_test_utils--1.0.sql | 10 ++ contrib/zenith_test_utils/zenithtest.c | 155 +++++++++++++++++- 4 files changed, 208 insertions(+), 30 deletions(-) diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index c040c4b816b..a5dcd1efc06 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -156,6 +156,10 @@ extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); + +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); + extern void zenith_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index c4dcff31fc3..7f3dc0bb09b 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -917,40 +917,20 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* - * zenith_read() -- Read the specified block from a relation. + * While function is defined in the zenith extension it's used within zenith_test_utils directly. + * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ -void -zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, - char *buffer) +void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer) { ZenithResponse *resp; - bool latest; - XLogRecPtr request_lsn; - - switch (reln->smgr_relpersistence) - { - case 0: - elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - request_lsn = zenith_get_request_lsn(&latest); { ZenithGetPageRequest request = { .req.tag = T_ZenithGetPageRequest, - .req.latest = latest, + .req.latest = request_latest, .req.lsn = request_lsn, - .rnode = reln->smgr_rnode.node, + .rnode = rnode, .forknum = forkNum, .blkno = blkno }; @@ -969,9 +949,9 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, (errcode(ERRCODE_IO_ERROR), errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", blkno, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, + rnode.spcNode, + rnode.dbNode, + rnode.relNode, forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn), errdetail("page server returned error: %s", @@ -983,6 +963,37 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, } pfree(resp); +} + +/* + * zenith_read() -- Read the specified block from a relation. 
+ */ +void +zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, + char *buffer) +{ + bool latest; + XLogRecPtr request_lsn; + + switch (reln->smgr_relpersistence) + { + case 0: + elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + + case RELPERSISTENCE_PERMANENT: + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + request_lsn = zenith_get_request_lsn(&latest); + zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql index dbf18288fd4..d595b043abf 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -12,3 +12,13 @@ RETURNS VOID AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +RETURNS bytea +AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' +LANGUAGE C PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index 2d42110cf36..bd867755e61 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -9,17 +9,34 @@ *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" +#include "access/relation.h" #include "access/xact.h" +#include "access/xlog.h" +#include "catalog/namespace.h" +#include 
"fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/varlena.h" PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(test_consume_xids); PG_FUNCTION_INFO_V1(clear_buffer_cache); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); +PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); + +/* + * This function is defined in the zenith extension, such declaration is fragile. + * The signature here would need to be updated whenever function parameters change in pagestore_smgr.c + */ +extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, + XLogRecPtr request_lsn, bool request_latest, char *buffer); /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. @@ -117,3 +134,139 @@ clear_buffer_cache(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + + +/* + * Reads the page from page server without buffer cache + * usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN + * NULL read lsn will result in reading the latest version. + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn(PG_FUNCTION_ARGS) +{ + bytea *raw_page; + ForkNumber forknum; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + text *relname; + text *forkname; + uint32 blkno; + + bool request_latest = PG_ARGISNULL(3); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + relname = PG_GETARG_TEXT_PP(0); + forkname = PG_GETARG_TEXT_PP(1); + blkno = PG_GETARG_UINT32(2); + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index \"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + + forknum = forkname_to_number(text_to_cstring(forkname)); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data); + + relation_close(rel, AccessShareLock); + + PG_RETURN_BYTEA_P(raw_page); +} + +/* + * Another option to read a relation page from page server without cache + * this version doesn't validate input and allows reading blocks of dropped relations + * + * Note: reading latest version will result in waiting for latest changes to reach the page server, + * if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page + */ +Datum +get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) +{ + char *raw_page_data; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to use raw page functions"))); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || + PG_ARGISNULL(3) || PG_ARGISNULL(4)) + PG_RETURN_NULL(); + + { + RelFileNode rnode = { + .spcNode = PG_GETARG_OID(0), + .dbNode = PG_GETARG_OID(1), + .relNode = PG_GETARG_OID(2) + }; + + ForkNumber forknum = PG_GETARG_UINT32(3); + + uint32 blkno = PG_GETARG_UINT32(4); + bool request_latest = PG_ARGISNULL(5); + uint64 read_lsn = request_latest ? 
GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + + + /* Initialize buffer to copy to */ + bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data); + PG_RETURN_BYTEA_P(raw_page); + } +} From a3709cc3643dd28c30b2b8f603ba3d60a586afb9 Mon Sep 17 00:00:00 2001 From: anastasia Date: Thu, 17 Feb 2022 19:16:49 +0300 Subject: [PATCH 114/167] Add backpressure_lsns() function. Fix zenith feedback processing --- contrib/zenith/zenith--1.0.sql | 12 ++++- contrib/zenith/zenith.c | 37 ++++++++++++++- contrib/zenith/zenith_functions.c | 35 -------------- src/backend/replication/walproposer.c | 67 +++++++++++++++++++++++++-- src/backend/replication/walsender.c | 6 ++- src/backend/storage/ipc/ipci.c | 6 +++ src/include/replication/walproposer.h | 12 +++++ 7 files changed, 130 insertions(+), 45 deletions(-) delete mode 100644 contrib/zenith/zenith_functions.c diff --git a/contrib/zenith/zenith--1.0.sql b/contrib/zenith/zenith--1.0.sql index 095104c1045..e414be8ceea 100644 --- a/contrib/zenith/zenith--1.0.sql +++ b/contrib/zenith/zenith--1.0.sql @@ -4,4 +4,14 @@ CREATE FUNCTION pg_cluster_size() RETURNS bigint AS 'MODULE_PATHNAME', 'pg_cluster_size' LANGUAGE C STRICT -PARALLEL UNSAFE; \ No newline at end of file +PARALLEL UNSAFE; + +CREATE FUNCTION backpressure_lsns( + OUT received_lsn pg_lsn, + OUT disk_consistent_lsn pg_lsn, + OUT remote_consistent_lsn pg_lsn +) +RETURNS record +AS 'MODULE_PATHNAME', 'backpressure_lsns' +LANGUAGE C STRICT +PARALLEL UNSAFE; diff --git a/contrib/zenith/zenith.c b/contrib/zenith/zenith.c index 3f2a6cee924..e88984d918c 100644 --- a/contrib/zenith/zenith.c +++ b/contrib/zenith/zenith.c @@ -15,9 +15,15 @@ #include "access/xlog.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" - +#include "catalog/pg_type.h" +#include "replication/walsender.h" +#include 
"replication/walproposer.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "utils/pg_lsn.h" PG_FUNCTION_INFO_V1(pg_cluster_size); +PG_FUNCTION_INFO_V1(backpressure_lsns); Datum pg_cluster_size(PG_FUNCTION_ARGS) @@ -30,4 +36,31 @@ pg_cluster_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); PG_RETURN_INT64(size); -} \ No newline at end of file +} + + +Datum +backpressure_lsns(PG_FUNCTION_ARGS) +{ + XLogRecPtr writePtr; + XLogRecPtr flushPtr; + XLogRecPtr applyPtr; + Datum values[3]; + bool nulls[3]; + TupleDesc tupdesc; + + zenith_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); + + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0); + tupdesc = BlessTupleDesc(tupdesc); + + MemSet(nulls, 0, sizeof(nulls)); + values[0] = LSNGetDatum(writePtr); + values[1] = LSNGetDatum(flushPtr); + values[2] = LSNGetDatum(applyPtr); + + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} diff --git a/contrib/zenith/zenith_functions.c b/contrib/zenith/zenith_functions.c deleted file mode 100644 index 3e2b137d205..00000000000 --- a/contrib/zenith/zenith_functions.c +++ /dev/null @@ -1,35 +0,0 @@ -/*------------------------------------------------------------------------- - * - * zenith.c - * Utility functions to expose zenith specific information to user - * - * IDENTIFICATION - * contrib/zenith/zenith.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" -#include "fmgr.h" - -#include "access/xact.h" -#include "access/clog.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" - - -PG_MODULE_MAGIC; - -PG_FUNCTION_INFO_V1(pg_cluster_size); - -Datum -pg_cluster_size(PG_FUNCTION_ARGS) -{ - int64 size; - - size = 
GetZenithCurrentClusterSize(); - - if (size == 0) - PG_RETURN_NULL(); - - PG_RETURN_INT64(size); -} \ No newline at end of file diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index cf473d29215..7752ae965d3 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -151,6 +151,7 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); + /* * WAL proposer bgworker entry point. */ @@ -1884,6 +1885,52 @@ GetAcknowledgedByQuorumWALPosition(void) return responses[n_safekeepers - quorum]; } + +static ZenithFeedbackState *zf_state; + +/* + * ZenithFeedbackShmemSize --- report amount of shared memory space needed + */ +Size +ZenithFeedbackShmemSize(void) +{ + return sizeof(ZenithFeedbackState); +} + +bool +ZenithFeedbackShmemInit(void) +{ + bool found; + + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + zf_state = ShmemInitStruct("Zenith Feedback", + sizeof(ZenithFeedbackState), + &found); + LWLockRelease(AddinShmemInitLock); + + return found; +} + +void +zenith_feedback_set(ZenithFeedback *zf) +{ + SpinLockAcquire(&zf_state->mutex); + memcpy(&zf_state->feedback, zf, sizeof(ZenithFeedback)); + SpinLockRelease(&zf_state->mutex); +} + + +void +zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) +{ + SpinLockAcquire(&zf_state->mutex); + *writeLsn = zf_state->feedback.ps_writelsn; + *flushLsn = zf_state->feedback.ps_flushlsn; + *applyLsn = zf_state->feedback.ps_applylsn; + SpinLockRelease(&zf_state->mutex); +} + + /* * Get ZenithFeedback fields from the most advanced safekeeper */ @@ -1891,13 +1938,13 @@ static void GetLatestZentihFeedback(ZenithFeedback *zf) { int latest_safekeeper = 0; - uint64 replyTime = 0; + XLogRecPtr ps_writelsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; 
i++) { - if (safekeeper[i].appendResponse.zf.ps_replytime > replyTime) + if (safekeeper[i].appendResponse.zf.ps_writelsn > ps_writelsn) { latest_safekeeper = i; - replyTime = safekeeper[i].appendResponse.zf.ps_replytime; + ps_writelsn = safekeeper[i].appendResponse.zf.ps_writelsn; } } @@ -1906,6 +1953,16 @@ GetLatestZentihFeedback(ZenithFeedback *zf) zf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_flushlsn; zf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.zf.ps_applylsn; zf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.zf.ps_replytime; + + elog(DEBUG2, "GetLatestZentihFeedback: currentClusterSize %lu," + " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + zf->currentClusterSize, + LSN_FORMAT_ARGS(zf->ps_writelsn), + LSN_FORMAT_ARGS(zf->ps_flushlsn), + LSN_FORMAT_ARGS(zf->ps_applylsn), + zf->ps_replytime); + + zenith_feedback_set(zf); } static void @@ -1919,11 +1976,11 @@ HandleSafekeeperResponse(void) minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); diskConsistentLsn = quorumFeedback.zf.ps_flushlsn; - // Get ZenithFeedback fields from the most advanced safekeeper - GetLatestZentihFeedback(&quorumFeedback.zf); if (!syncSafekeepers) { + // Get ZenithFeedback fields from the most advanced safekeeper + GetLatestZentihFeedback(&quorumFeedback.zf); SetZenithCurrentClusterSize(quorumFeedback.zf.currentClusterSize); } diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3b1c25416a7..f649302ba9e 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1936,6 +1936,8 @@ ProcessZenithFeedbackMessage(void) ParseZenithFeedbackMessage(&reply_message, &zf); + zenith_feedback_set(&zf); + SetZenithCurrentClusterSize(zf.currentClusterSize); ProcessStandbyReply(zf.ps_writelsn, @@ -3826,10 +3828,10 @@ backpressure_lag(void) XLogRecPtr applyPtr; XLogRecPtr myFlushLsn = GetFlushRecPtr(); - GetMinReplicaLsn(&writePtr, &flushPtr, 
&applyPtr); + zenith_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024*1024) - elog(DEBUG2, "current flushLsn %X/%X StandbyReply: write %X/%X flush %X/%X apply %X/%X", + elog(DEBUG2, "current flushLsn %X/%X ZenithFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 3e4ec53a97e..5fb07a87eb8 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -34,6 +34,7 @@ #include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/walsender.h" +#include "replication/walproposer.h" #include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/ipc.h" @@ -150,6 +151,9 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + + size = add_size(size, ZenithFeedbackShmemSize()); + #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -270,6 +274,8 @@ CreateSharedMemoryAndSemaphores(void) SyncScanShmemInit(); AsyncShmemInit(); + ZenithFeedbackShmemInit(); + #ifdef EXEC_BACKEND /* diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 9bd5d8d1508..159af4f4bdc 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -290,6 +290,14 @@ typedef struct ZenithFeedback TimestampTz ps_replytime; } ZenithFeedback; + +typedef struct ZenithFeedbackState +{ + slock_t mutex; + ZenithFeedback feedback; + +} ZenithFeedbackState; + /* * Report safekeeper state to proposer */ @@ -380,6 +388,10 @@ void ParseZenithFeedbackMessage(StringInfo reply_message, void StartReplication(StartReplicationCmd *cmd); void WalProposerSync(int argc, char *argv[]); +Size ZenithFeedbackShmemSize(void); +bool ZenithFeedbackShmemInit(void); +void 
zenith_feedback_set(ZenithFeedback *zf); +void zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ From 1872ba6cef7cd73d3b1e7169945b47c572c27f51 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 23 Feb 2022 18:50:31 +0300 Subject: [PATCH 115/167] Fix zenith_test_utils linkage on macOS Use function pointer to perform a cross-extension calls. --- contrib/zenith_test_utils/Makefile | 3 +++ contrib/zenith_test_utils/zenithtest.c | 25 ++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/contrib/zenith_test_utils/Makefile b/contrib/zenith_test_utils/Makefile index 9203f2349d3..5b2fcdc18fe 100644 --- a/contrib/zenith_test_utils/Makefile +++ b/contrib/zenith_test_utils/Makefile @@ -10,11 +10,14 @@ EXTENSION = zenith_test_utils DATA = zenith_test_utils--1.0.sql PGFILEDESC = "zenith_test_utils - helpers for zenith testing and debugging" +EXTRA_INSTALL=contrib/zenith + ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else +PG_CPPFLAGS = -I$(top_srcdir)/contrib subdir = contrib/zenith_test_utils top_builddir = ../.. include $(top_builddir)/src/Makefile.global diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index bd867755e61..c1e2c1c92f4 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -22,22 +22,41 @@ #include "utils/builtins.h" #include "utils/rel.h" #include "utils/varlena.h" - +#include "zenith/pagestore_client.h" PG_MODULE_MAGIC; +extern void _PG_init(void); + PG_FUNCTION_INFO_V1(test_consume_xids); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); /* - * This function is defined in the zenith extension, such declaration is fragile. + * Linkage to functions in zenith module. 
* The signature here would need to be updated whenever function parameters change in pagestore_smgr.c */ -extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, +typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, XLogRecPtr request_lsn, bool request_latest, char *buffer); +static zenith_read_at_lsn_type zenith_read_at_lsn_ptr; + +/* + * Module initialize function: fetch function pointers for cross-module calls. + */ +void +_PG_init(void) +{ + /* Asserts verify that typedefs above match original declarations */ + AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); + zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) + load_external_function("$libdir/zenith", "zenith_read_at_lsn", + true, NULL); +} + +#define zenith_read_at_lsn zenith_read_at_lsn_ptr + /* * test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound. */ From 31dc24ab29e6bdd5cfb85920a9c728f759c01b29 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 21 Feb 2022 15:07:26 +0300 Subject: [PATCH 116/167] Add warning for unrecognized GUCs with zenith prefix refer #1262 --- contrib/zenith/libpagestore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index d3e20fc6411..d8ec3eba81d 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -429,6 +429,7 @@ _PG_init(void) NULL, NULL, NULL); relsize_hash_init(); + EmitWarningsOnPlaceholders("zenith"); if (page_server != NULL) zenith_log(ERROR, "libpqpagestore already loaded"); From ce7ff2d3b731307f3abb5c9bed6b3f401875e530 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 8 Mar 2022 16:01:03 +0300 Subject: [PATCH 117/167] Count WAL flushes in walreceiver (#139) --- src/backend/replication/walreceiver.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index
4831a259c48..9b3f01207f8 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -494,6 +494,13 @@ WalReceiverMain(void) if (endofwal) break; + /* + * Update WAL statistics, which are produced inside + * issue_xlog_fsync function. This is useful for counting + * WAL flushes, by querying pg_stat_wal. + */ + pgstat_send_wal(true); + /* * Ideally we would reuse a WaitEventSet object repeatedly * here to avoid the overheads of WaitLatchOrSocket on epoll From 093aa160e5df19814ff19b995d36dd5ee03c7f8b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 Dec 2021 20:23:51 +0300 Subject: [PATCH 118/167] Use local relation cache for smgr_exists refer #1077 --- contrib/zenith/pagestore_smgr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 7f3dc0bb09b..5ab935a4e0b 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -618,6 +618,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) { bool exists; ZenithResponse *resp; + BlockNumber n_blocks; bool latest; XLogRecPtr request_lsn; @@ -644,6 +645,11 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } + if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks)) + { + return true; + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { @@ -748,6 +754,9 @@ zenith_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * exist. */ mdunlink(rnode, forkNum, isRedo); + if (!RelFileNodeBackendIsTemp(rnode)) { + forget_cached_relsize(rnode.node, forkNum); + } } /* From 9a2d4ca1ada54dcb13d7e18fe0710de987b87f1f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 16:05:05 +0200 Subject: [PATCH 119/167] Populate relsize cache when relation is created. 
Postgres can perform an smgrnblocks() call on the relation right after creating it, and we don't update the last-written LSN on smgrcreate(). Perhaps we should update last-written LSN, instead. This isn't bulletproof. --- contrib/zenith/pagestore_smgr.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 5ab935a4e0b..afda2bd4767 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -720,6 +720,18 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) reln->smgr_rnode.node.relNode, forkNum); + /* + * Newly created relation is empty, remember that in the relsize cache. + * + * FIXME: This is currently not just an optimization, but required for + * correctness. Postgres can call smgrnblocks() on the newly-created + * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation is created, so if we didn't remember the size in the relsize + * cache, we might call smgrnblocks() on the newly-created relation before + * the creation WAL record has been received by the page server. + */ + set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdcreate(reln, forkNum, isRedo); From 756a01aade765d1d2ac115e7e189865ff697222b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 28 Mar 2022 09:38:46 +0300 Subject: [PATCH 120/167] Fix pg_table_size() on a view --- contrib/zenith/pagestore_smgr.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index afda2bd4767..18c55fa5cdc 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -650,6 +650,23 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return true; } + /* + * \d+ on a view calls smgrexists with 0/0/0 relfilenode. 
The page server + * will error out if you check that, because the whole dbdir for tablespace + * 0, db 0 doesn't exist. We possibly should change the page server to + * accept that and return 'false', to be consistent with mdexists(). But + * we probably also should fix pg_table_size() to not call smgrexists() + * with bogus relfilenode. + * + * For now, handle that special case here. + */ + if (reln->smgr_rnode.node.spcNode == 0 && + reln->smgr_rnode.node.dbNode == 0 && + reln->smgr_rnode.node.relNode == 0) + { + return false; + } + request_lsn = zenith_get_request_lsn(&latest); { ZenithExistsRequest request = { From 5c278ed0aca5dea9340d9af4ad5f004d905ff1b7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 28 Mar 2022 18:10:05 +0400 Subject: [PATCH 121/167] Don't set commitLsn to truncateLsn. It might jump back (on compute) this way, which is not fatal but violates sanity checks. --- src/backend/replication/walproposer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 7752ae965d3..dfc4a538918 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -914,8 +914,6 @@ RecvAcceptorGreeting(Safekeeper *sk) /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; - sk->appendResponse.flushLsn = truncateLsn; - sk->appendResponse.hs.ts = 0; ++n_connected; if (n_connected <= quorum) From 1dc6535b877050a3b3768ca1ed140dd15a1a3e78 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 31 Mar 2022 18:46:43 +0300 Subject: [PATCH 122/167] Raise fatal error on failed recovery (#147) --- src/backend/replication/walproposer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index dfc4a538918..4d5092e94eb 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1257,10 +1257,14 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec break; } } - ereport(DEBUG1, + ereport(LOG, (errmsg("end of replication stream at %X/%X: %m", LSN_FORMAT_ARGS(rec_end_lsn)))); walrcv_disconnect(wrconn); + + /* failed to receive all WAL till endpos */ + if (rec_end_lsn < endpos) + return false; } else { From a260728984e27137130681ab24fa83a4f4815c4e Mon Sep 17 00:00:00 2001 From: Anton Shyrabokau <97127717+antons-antons@users.noreply.github.com> Date: Fri, 1 Apr 2022 12:44:28 -0700 Subject: [PATCH 123/167] Enable dumping corrupt WAL segments (#145) * Enable dumping corrupt WAL segments Add ability to dump WAL segment with corrupt page headers and records skips over missing/broken page headers skips over misformatted log records allows dumping log record from a particular file starting from an optional offset (without a need of carefully crafted input) --- src/backend/access/transam/xlogreader.c | 117 ++++++++++---- src/bin/pg_waldump/pg_waldump.c | 194 ++++++++++++++++++++++-- src/include/access/xlogreader.h | 5 + 3 files changed, 276 insertions(+), 40 deletions(-) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index d797d9d5087..9f52c6fca18 100644 --- a/src/backend/access/transam/xlogreader.c +++ 
b/src/backend/access/transam/xlogreader.c @@ -239,7 +239,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr) || state->skip_lsn_checks); ResetDecoder(state); @@ -279,6 +279,14 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) bool gotheader; int readOff; +#define SKIP_INVALID_RECORD(rec_ptr) do { \ + rec_ptr = MAXALIGN(rec_ptr + 1); \ + if (rec_ptr % XLOG_BLCKSZ <= MAXALIGN(1)) \ + goto restart; \ + else \ + goto skip_invalid; \ + } while (0); + /* * randAccess indicates whether to verify the previous-record pointer of * the record we're reading. We only do this if we're reading @@ -315,7 +323,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * In this case, EndRecPtr should already be pointing to a valid * record starting position. */ - Assert(XRecOffIsValid(RecPtr)); + Assert(XRecOffIsValid(RecPtr) || state->skip_lsn_checks); randAccess = true; } @@ -351,17 +359,23 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "invalid record offset at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "contrecord is requested by %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } /* ReadPageInternal has verified the page header */ @@ -376,6 +390,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * cannot access any other fields until we've verified that we got the * whole header. 
*/ +skip_invalid: record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -391,7 +406,13 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) { if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } else @@ -399,12 +420,19 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) /* XXX: more validation should be done here */ if (total_len < SizeOfXLogRecord) { - report_invalid_record(state, - "invalid record length at %X/%X: wanted %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } + gotheader = false; } @@ -425,10 +453,16 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) if (total_len > state->readRecordBufSize && !allocate_recordbuf(state, total_len)) { - /* We treat this as a "bogus data" condition */ - report_invalid_record(state, "record length %u at %X/%X too long", - total_len, LSN_FORMAT_ARGS(RecPtr)); - goto err; + + if(!state->skip_invalid_records) + { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", + total_len, LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Copy the first fragment of the record from the first page. 
*/ @@ -473,10 +507,15 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) /* Check that the continuation on next page looks valid */ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "there is no contrecord flag at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* @@ -486,12 +525,17 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "invalid contrecord length %u (expected %lld) at %X/%X", pageHeader->xlp_rem_len, ((long long) total_len) - gotlen, LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Append the continuation from this page to the buffer */ @@ -522,7 +566,13 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } } while (gotlen < total_len); @@ -531,7 +581,12 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->ReadRecPtr = RecPtr; @@ -548,7 +603,12 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) /* Record does not cross a page boundary */ if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } state->EndRecPtr = RecPtr + MAXALIGN(total_len); 
@@ -652,8 +712,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* we can be sure to have enough WAL available, we scrolled back */ Assert(readLen == XLOG_BLCKSZ); - if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, state->readBuf) && !state->skip_page_validation) goto err; } @@ -690,7 +749,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr) && !state->skip_page_validation) goto err; /* update read state information */ @@ -748,7 +807,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We can't exactly verify the prev-link, but surely it should be less * than the record's own address. */ - if (!(record->xl_prev < RecPtr)) + if (!(record->xl_prev < RecPtr) && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -764,7 +823,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * check guards against torn WAL pages where a stale but valid-looking * WAL record starts on a sector boundary. */ - if (record->xl_prev != PrevRecPtr) + if (record->xl_prev != PrevRecPtr && !state->skip_lsn_checks) { report_invalid_record(state, "record with incorrect prev-link %X/%X at %X/%X", @@ -907,7 +966,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, * check typically fails when an old WAL segment is recycled, and hasn't * yet been overwritten with new data yet. 
*/ - if (hdr->xlp_pageaddr != recaddr) + if (hdr->xlp_pageaddr != recaddr && !state->skip_lsn_checks) { char fname[MAXFNAMELEN]; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index f8b8afe4a7b..786da4be3ab 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -13,9 +13,11 @@ #include "postgres.h" #include +#include #include #include + #include "access/transam.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" @@ -23,8 +25,11 @@ #include "common/fe_memutils.h" #include "common/logging.h" #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "rmgrdesc.h" +#define OFFSET_INVALID ((size_t)-1) + static const char *progname; static int WalSegSz; @@ -35,6 +40,7 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + char* input_filename; } XLogDumpPrivate; typedef struct XLogDumpConfig @@ -52,6 +58,7 @@ typedef struct XLogDumpConfig int filter_by_rmgr; TransactionId filter_by_xid; bool filter_by_xid_enabled; + bool ignore_format_errors; } XLogDumpConfig; typedef struct Stats @@ -70,8 +77,36 @@ typedef struct XLogDumpStats Stats record_stats[RM_NEXT_ID][MAX_XLINFO_TYPES]; } XLogDumpStats; + #define fatal_error(...) 
do { pg_log_fatal(__VA_ARGS__); exit(EXIT_FAILURE); } while(0) +/* calculate ceil(log base 2) of num */ +static int +my_log2(long num) +{ + /* + * guard against too-large input, which would be invalid for + * pg_ceil_log2_*() + */ + if (num > LONG_MAX / 2) + num = LONG_MAX / 2; + +#if SIZEOF_LONG < 8 + return pg_ceil_log2_32(num); +#else + return pg_ceil_log2_64(num); +#endif +} + +/* calculate first power of 2 >= num, bounded to what will fit in an int */ +static int +next_pow2_int(long num) +{ + if (num > INT_MAX / 2) + num = INT_MAX / 2; + return 1 << my_log2(num); +} + static void print_rmgr_list(void) { @@ -287,6 +322,18 @@ WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID tli = *tli_p; char fname[MAXPGPATH]; int tries; + XLogDumpPrivate *private = state->private_data; + + if(private->input_filename) + { + Assert(nextSegNo == 0); + + state->seg.ws_file = open_file_in_directory(state->segcxt.ws_dir, private->input_filename); + if (state->seg.ws_file >= 0) + return; + + fatal_error("could not open file \"%s\": %m", private->input_filename); + } XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); @@ -357,6 +404,7 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; + char *actual_fname = private->input_filename ? 
private->input_filename : fname; XLogFileName(fname, seg->ws_tli, seg->ws_segno, state->segcxt.ws_segsize); @@ -365,11 +413,11 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { errno = errinfo.wre_errno; fatal_error("could not read from file %s, offset %u: %m", - fname, errinfo.wre_off); + actual_fname, errinfo.wre_off); } else fatal_error("could not read from file %s, offset %u: read %d of %zu", - fname, errinfo.wre_off, errinfo.wre_read, + actual_fname, errinfo.wre_off, errinfo.wre_read, (Size) errinfo.wre_req); } @@ -468,16 +516,25 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) int block_id; uint8 info = XLogRecGetInfo(record); XLogRecPtr xl_prev = XLogRecGetPrev(record); + XLogDumpPrivate *private = record->private_data; StringInfoData s; XLogDumpRecordLen(record, &rec_len, &fpi_len); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + if(private->input_filename) + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, offset: 0x%lX, prev %X/%08X, ", desc->rm_name, rec_len, XLogRecGetTotalLen(record), XLogRecGetXid(record), - LSN_FORMAT_ARGS(record->ReadRecPtr), + record->ReadRecPtr, LSN_FORMAT_ARGS(xl_prev)); + else + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + LSN_FORMAT_ARGS(record->ReadRecPtr), + LSN_FORMAT_ARGS(xl_prev)); id = desc->rm_identify(info); if (id == NULL) @@ -729,7 +786,10 @@ usage(void) printf(_(" -b, --bkp-details output detailed information about backup blocks\n")); printf(_(" -e, --end=RECPTR stop reading at WAL location RECPTR\n")); printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); + printf(_(" -F, --file=FNAME dump log records from a single file\n")); + printf(_(" -i, --ignore ignore format errors, skip invalid structures\n")); printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -o, 
--offset=OFFSET offset of the first record to in a file to dump\n")); printf(_(" -p, --path=PATH directory in which to find log segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); @@ -760,14 +820,20 @@ main(int argc, char **argv) XLogRecord *record; XLogRecPtr first_record; char *waldir = NULL; + char *fname = NULL; char *errormsg; + bool single_file = false; + size_t start_offset = OFFSET_INVALID; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, + {"file", required_argument, NULL, 'F'}, {"help", no_argument, NULL, '?'}, + {"ignore", no_argument, NULL, 'i'}, {"limit", required_argument, NULL, 'n'}, + {"offset", required_argument, NULL, 'o'}, {"path", required_argument, NULL, 'p'}, {"quiet", no_argument, NULL, 'q'}, {"rmgr", required_argument, NULL, 'r'}, @@ -808,6 +874,7 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.input_filename = NULL; config.quiet = false; config.bkp_details = false; @@ -819,6 +886,7 @@ main(int argc, char **argv) config.filter_by_xid_enabled = false; config.stats = false; config.stats_per_record = false; + config.ignore_format_errors = false; if (argc <= 1) { @@ -826,7 +894,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "be:fn:p:qr:s:t:x:z", + while ((option = getopt_long(argc, argv, "be:fF:in:o:p:qr:s:t:x:z", long_options, &optindex)) != -1) { switch (option) @@ -846,6 +914,13 @@ main(int argc, char **argv) case 'f': config.follow = true; break; + case 'F': + fname = pg_strdup(optarg); + single_file = true; + break; + case 'i': + config.ignore_format_errors = true; + break; case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -853,6 +928,13 @@ main(int argc, char **argv) goto 
bad_argument; } break; + case 'o': + if (sscanf(optarg, "%zu", &start_offset) != 1) + { + pg_log_error("could not parse offset \"%s\"", optarg); + goto bad_argument; + } + break; case 'p': waldir = pg_strdup(optarg); break; @@ -939,6 +1021,73 @@ main(int argc, char **argv) goto bad_argument; } + if (start_offset != OFFSET_INVALID) + { + if(!XLogRecPtrIsInvalid(private.startptr) || !XLogRecPtrIsInvalid(private.endptr)) + { + pg_log_error("either file offset or start/end pointers should be specified"); + goto bad_argument; + } + + if(!single_file) + { + pg_log_error("offset option could only be used with filename option"); + goto bad_argument; + } + + /* Log records are maxaligned, start at the closest next position */ + private.startptr = MAXALIGN(start_offset); + } + + if(single_file) + { + char *directory = NULL; + int fd; + struct stat stat; + + if(config.follow) + { + pg_log_error("Follow could not be used in file dump mode"); + goto bad_argument; + } + + if (waldir != NULL) + { + pg_log_error("either single file or wal directory should be specified"); + goto bad_argument; + } + + split_path(fname, &directory, &private.input_filename); + waldir = directory; + + if(waldir == NULL) + { + char *cwd = malloc(MAXPGPATH); + + if (!getcwd(cwd, MAXPGPATH)) + fatal_error("could identify current directory: %m"); + + waldir = cwd; + } + + if (!verify_directory(waldir)) + fatal_error("could not open directory \"%s\": %m", waldir); + + fd = open_file_in_directory(waldir, private.input_filename); + if (fd < 0) + fatal_error("could not open file \"%s\"", private.input_filename); + + if(fstat(fd, &stat) != 0) + fatal_error("could not stat file \"%s\"", private.input_filename); + + private.endptr = stat.st_size; + + /* Round up segment size to next power of 2 or 1MB */ + WalSegSz = Max(next_pow2_int(private.endptr), 1024 * 1024); + + close(fd); + } + if (waldir != NULL) { /* validate path points to directory */ @@ -957,6 +1106,12 @@ main(int argc, char **argv) int fd; XLogSegNo 
segno; + if(single_file) + { + pg_log_error("either single file or start/end boundaries should be specified"); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1029,10 +1184,11 @@ main(int argc, char **argv) } } else - waldir = identify_target_directory(waldir, NULL); + if (!single_file) + waldir = identify_target_directory(waldir, NULL); /* we don't know what to print */ - if (XLogRecPtrIsInvalid(private.startptr)) + if (XLogRecPtrIsInvalid(private.startptr) && !single_file) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1050,12 +1206,28 @@ main(int argc, char **argv) if (!xlogreader_state) fatal_error("out of memory"); - /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if(single_file) + { + if(config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + } + + xlogreader_state->skip_lsn_checks = true; - if (first_record == InvalidXLogRecPtr) - fatal_error("could not find a valid record after %X/%X", + first_record = private.startptr; + XLogBeginRead(xlogreader_state, first_record); + } + else + { + /* first find a valid recptr to start from */ + first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + + if (first_record == InvalidXLogRecPtr) + fatal_error("could not find a valid record after %X/%X", LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 10458c23eda..c7fac7bdace 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -262,6 +262,11 @@ struct XLogReaderState XLogRecPtr missingContrecPtr; /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; + + /* Disable validation to allow dumping corrupt WAL */ 
+ bool skip_page_validation; + bool skip_invalid_records; + bool skip_lsn_checks; }; /* Get a new XLogReader */ From 848145999653be213141a330569b6f2d9f53dbf2 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 4 Apr 2022 15:52:29 +0300 Subject: [PATCH 124/167] Don't hold walproposer WAL in memory (#141) WAL is no longer in memory to prevent OOM in the compute. Removed in-memory queue because it's not needed anymore. When streaming, WAL is now read directly from disk. Every safekeeper has a separate XLogReader. walproposer will now read as much WAL as it can for a single AppendRequest message, it can help with recovering lagging safekeepers. Because Recovery needs to save WAL for streaming, now walproposer can write WAL to disk and `--sync-safekeepers` mode will create pg_wal directory if needed. Replication slot `restart_lsn` is now synced with `truncate_lsn` to prevent truncation of disk WAL until needed. --- src/backend/replication/walproposer.c | 318 ++++++++------------ src/backend/replication/walproposer_utils.c | 106 +++++++ src/backend/replication/walsender.c | 123 ++++---- src/include/replication/walproposer.h | 52 ++-- 4 files changed, 323 insertions(+), 276 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 4d5092e94eb..4843b10e1d9 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -37,7 +37,9 @@ #include #include +#include #include "access/xlogdefs.h" +#include "access/xlogutils.h" #include "replication/walproposer.h" #include "storage/latch.h" #include "miscadmin.h" @@ -73,12 +75,8 @@ WalProposerFunctionsType *WalProposerFunctions = NULL; static int n_safekeepers = 0; static int quorum = 0; static Safekeeper safekeeper[MAX_SAFEKEEPERS]; -static WalMessage *msgQueueHead; -static WalMessage *msgQueueTail; -static XLogRecPtr lastSentLsn; /* WAL has been appended to msg queue up to - * this point */ -static XLogRecPtr lastSentCommitLsn; /* last 
commitLsn broadcast to - * safekeepers */ +static XLogRecPtr availableLsn; /* WAL has been generated up to this point */ +static XLogRecPtr lastSentCommitLsn; /* last commitLsn broadcast to safekeepers */ static ProposerGreeting greetRequest; static VoteRequest voteRequest; /* Vote request for safekeeper */ static WaitEventSet *waitEvents; @@ -134,10 +132,8 @@ static bool WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr start static void SendProposerElected(Safekeeper *sk); static void WalProposerStartStreaming(XLogRecPtr startpos); static void StartStreaming(Safekeeper *sk); -static void SendMessageToNode(Safekeeper *sk, WalMessage *msg); -static void BroadcastMessage(WalMessage *msg); -static WalMessage * CreateMessage(XLogRecPtr startpos, char *data, int len); -static WalMessage * CreateMessageCommitLsnOnly(XLogRecPtr lsn); +static void SendMessageToNode(Safekeeper *sk); +static void BroadcastAppendRequest(void); static void HandleActiveState(Safekeeper *sk, uint32 events); static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); @@ -198,7 +194,10 @@ WalProposerMain(Datum main_arg) void WalProposerSync(int argc, char *argv[]) { + struct stat stat_buf; + syncSafekeepers = true; + ThisTimeLineID = 1; InitStandaloneProcess(argv[0]); @@ -233,6 +232,22 @@ WalProposerSync(int argc, char *argv[]) (errcode_for_socket_access(), errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); + ChangeToDataDir(); + + /* Create pg_wal directory, if it doesn't exist */ + if (stat(XLOGDIR, &stat_buf) != 0) + { + ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR))); + if (MakePGDirectory(XLOGDIR) < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + XLOGDIR))); + exit(1); + } + } + WalProposerInit(0, 0); process_shared_preload_libraries_in_progress = false; @@ -247,12 +262,11 @@ WalProposerSync(int argc, char *argv[]) * 
called from walsender every time the new WAL is available. */ void -WalProposerBroadcast(XLogRecPtr startpos, char *data, int len) +WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos) { - WalMessage *msg = CreateMessage(startpos, data, len); - - if (msg != NULL) - BroadcastMessage(msg); + Assert(startpos == availableLsn && endpos >= availableLsn); + availableLsn = endpos; + BroadcastAppendRequest(); } /* @@ -303,9 +317,9 @@ WalProposerPoll(void) * If no WAL was generated during timeout (and we have already * collected the quorum), then send pool message */ - if (lastSentLsn != InvalidXLogRecPtr) + if (availableLsn != InvalidXLogRecPtr) { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + BroadcastAppendRequest(); } } } @@ -379,9 +393,12 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) */ safekeeper[n_safekeepers].conninfo[0] = '\0'; initStringInfo(&safekeeper[n_safekeepers].outbuf); + safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open, .segment_close = wal_segment_close), NULL); + if (safekeeper[n_safekeepers].xlogreader == NULL) + elog(FATAL, "Failed to allocate xlog reader"); safekeeper[n_safekeepers].flushWrite = false; - safekeeper[n_safekeepers].currMsg = NULL; safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr; + safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr; n_safekeepers += 1; } if (n_safekeepers < 1) @@ -513,7 +530,7 @@ ShutdownConnection(Safekeeper *sk) sk->conn = NULL; sk->state = SS_OFFLINE; sk->flushWrite = false; - sk->currMsg = NULL; + sk->streamingAt = InvalidXLogRecPtr; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); @@ -1087,13 +1104,13 @@ HandleElectedProposer(void) if (syncSafekeepers) { /* - * Queue empty message to enforce receiving feedback - * even from nodes who are fully recovered; this is - * required to learn they switched epoch which finishes - * sync-safeekepers who 
doesn't generate any real new - * records. Will go away once we switch to async acks. - */ - BroadcastMessage(CreateMessageCommitLsnOnly(propEpochStartLsn)); + * Send empty message to enforce receiving feedback + * even from nodes who are fully recovered; this is + * required to learn they switched epoch which finishes + * sync-safeekepers who doesn't generate any real new + * records. Will go away once we switch to async acks. + */ + BroadcastAppendRequest(); /* keep polling until all safekeepers are synced */ return; @@ -1172,6 +1189,12 @@ DetermineEpochStartLsn(void) Assert((truncateLsn != InvalidXLogRecPtr) || (syncSafekeepers && truncateLsn == propEpochStartLsn)); + /* + * We will be generating WAL since propEpochStartLsn, so we should set + * availableLsn to mark this LSN as the latest available position. + */ + availableLsn = propEpochStartLsn; + /* * Proposer's term history is the donor's + its own entry. */ @@ -1249,7 +1272,10 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec sizeof rec_start_lsn); rec_start_lsn = pg_ntoh64(rec_start_lsn); rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - (void) CreateMessage(rec_start_lsn, buf, len); + + /* write WAL to disk */ + XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); + ereport(DEBUG1, (errmsg("Recover message %X/%X length %d", LSN_FORMAT_ARGS(rec_start_lsn), len))); @@ -1374,7 +1400,7 @@ SendProposerElected(Safekeeper *sk) } } - Assert(msgQueueHead == NULL || sk->startStreamingAt >= msgQueueHead->req.beginLsn); + Assert(sk->startStreamingAt >= truncateLsn && sk->startStreamingAt <= availableLsn); msg.tag = 'e'; msg.term = propTerm; @@ -1426,39 +1452,29 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void StartStreaming(Safekeeper *sk) { - WalMessage *startMsg = msgQueueHead; - /* * This is the only entrypoint to state SS_ACTIVE. It's executed * exactly once for a connection. 
*/ sk->state = SS_ACTIVE; - - while (startMsg != NULL && startMsg->req.endLsn <= sk->startStreamingAt) - startMsg = startMsg->next; - - /* We should always have WAL to start from sk->startStreamingAt */ - Assert(startMsg == NULL || startMsg->req.beginLsn <= sk->startStreamingAt); + sk->streamingAt = sk->startStreamingAt; /* event set will be updated inside SendMessageToNode */ - SendMessageToNode(sk, startMsg); + SendMessageToNode(sk); } /* - * Start sending message to the particular node. Always updates event set. + * Try to send message to the particular node. Always updates event set. Will + * send at least one message, if socket is ready. * * Can be used only for safekeepers in SS_ACTIVE state. State can be changed * in case of errors. */ static void -SendMessageToNode(Safekeeper *sk, WalMessage *msg) +SendMessageToNode(Safekeeper *sk) { - /* we shouldn't be already sending something */ - Assert(sk->currMsg == NULL); Assert(sk->state == SS_ACTIVE); - sk->currMsg = msg; - /* Note: we always send everything to the safekeeper until WOULDBLOCK or nothing left to send */ HandleActiveState(sk, WL_SOCKET_WRITEABLE); } @@ -1467,95 +1483,25 @@ SendMessageToNode(Safekeeper *sk, WalMessage *msg) * Broadcast new message to all caught-up safekeepers */ static void -BroadcastMessage(WalMessage *msg) +BroadcastAppendRequest() { for (int i = 0; i < n_safekeepers; i++) - { - if (safekeeper[i].state == SS_ACTIVE && safekeeper[i].currMsg == NULL) - { - SendMessageToNode(&safekeeper[i], msg); - } - } -} - -static WalMessage * -CreateMessage(XLogRecPtr startpos, char *data, int len) -{ - /* Create new message and append it to message queue */ - WalMessage *msg; - XLogRecPtr endpos; - - len -= XLOG_HDR_SIZE; - endpos = startpos + len; - if (msgQueueTail && msgQueueTail->req.endLsn >= endpos) - { - /* Message already queued */ - return NULL; - } - Assert(len >= 0); - msg = (WalMessage *) malloc(sizeof(WalMessage) + len); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else 
- msgQueueHead = msg; - msgQueueTail = msg; - - msg->size = sizeof(AppendRequestHeader) + len; - msg->next = NULL; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; - msg->req.beginLsn = startpos; - msg->req.endLsn = endpos; - msg->req.proposerId = greetRequest.proposerId; - memcpy(&msg->req + 1, data + XLOG_HDR_SIZE, len); - - Assert(msg->req.endLsn >= lastSentLsn); - lastSentLsn = msg->req.endLsn; - return msg; + if (safekeeper[i].state == SS_ACTIVE) + SendMessageToNode(&safekeeper[i]); } -/* - * Create WAL message with no data, just to let the safekeepers - * know that commit lsn has advanced. - */ -static WalMessage * -CreateMessageCommitLsnOnly(XLogRecPtr lsn) +static void +PrepareAppendRequest(AppendRequestHeader *req, XLogRecPtr beginLsn, XLogRecPtr endLsn) { - /* Create new message and append it to message queue */ - WalMessage *msg; - - msg = (WalMessage *) malloc(sizeof(WalMessage)); - if (msgQueueTail != NULL) - msgQueueTail->next = msg; - else - msgQueueHead = msg; - msgQueueTail = msg; - - msg->size = sizeof(AppendRequestHeader); - msg->next = NULL; - msg->req.tag = 'a'; - msg->req.term = propTerm; - msg->req.epochStartLsn = propEpochStartLsn; - - /* - * This serves two purposes: 1) After all msgs from previous epochs are - * pushed we queue empty WalMessage with lsn set to epochStartLsn which - * commands to switch the epoch, which allows to do the switch without - * creating new epoch records (we especially want to avoid such in --sync - * mode). Walproposer can advance commit_lsn only after the switch, so - * this lsn (reported back) also is the first possible advancement point. - * 2) Maintain common invariant of queue entries sorted by LSN. 
- */ - msg->req.beginLsn = lsn; - msg->req.endLsn = lsn; - msg->req.proposerId = greetRequest.proposerId; - - /* - * truncateLsn and commitLsn are set just before the message sent, in - * SendAppendRequests() - */ - return msg; + Assert(endLsn >= beginLsn); + req->tag = 'a'; + req->term = propTerm; + req->epochStartLsn = propEpochStartLsn; + req->beginLsn = beginLsn; + req->endLsn = endLsn; + req->commitLsn = GetAcknowledgedByQuorumWALPosition(); + req->truncateLsn = truncateLsn; + req->proposerId = greetRequest.proposerId; } /* @@ -1578,20 +1524,22 @@ HandleActiveState(Safekeeper *sk, uint32 events) * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. * - * sk->currMsg checks if we have pending unsent messages. This check isn't - * necessary now, because we always send queue messages immediately after - * creation. But it's good to have it here in case we change this behavior + * LSN comparison checks if we have pending unsent messages. This check isn't + * necessary now, because we always send append messages immediately after + * arrival. But it's good to have it here in case we change this behavior * in the future. */ - if (sk->currMsg != NULL || sk->flushWrite) + if (sk->streamingAt != availableLsn || sk->flushWrite) newEvents |= WL_SOCKET_WRITEABLE; UpdateEventSet(sk, newEvents); } /* - * Send queue messages starting from sk->currMsg until the end or non-writable + * Send WAL messages starting from sk->streamingAt until the end or non-writable * socket, whichever comes first. Caller should take care of updating event set. + * Even if no unsent WAL is available, at least one empty message will be sent + * as a heartbeat, if socket is ready. * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. 
@@ -1599,9 +1547,11 @@ HandleActiveState(Safekeeper *sk, uint32 events) static bool SendAppendRequests(Safekeeper *sk) { - WalMessage *msg; + XLogRecPtr endLsn; AppendRequestHeader *req; PGAsyncWriteResult writeResult; + WALReadError errinfo; + bool sentAnything = false; if (sk->flushWrite) { @@ -1616,37 +1566,21 @@ SendAppendRequests(Safekeeper *sk) sk->flushWrite = false; } - while (sk->currMsg) + while (sk->streamingAt != availableLsn || !sentAnything) { - msg = sk->currMsg; - req = &msg->req; + sentAnything = true; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(); - req->truncateLsn = truncateLsn; + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; - /* - * If we need to send this message not from the beginning, - * form the cut version. Only happens for the first - * message. - */ - if (sk->startStreamingAt > msg->req.beginLsn) - { - uint32 len; - uint32 size; - - Assert(sk->startStreamingAt < req->endLsn); - - len = msg->req.endLsn - sk->startStreamingAt; - size = sizeof(AppendRequestHeader) + len; - req = malloc(size); - *req = msg->req; - req->beginLsn = sk->startStreamingAt; - memcpy(req + 1, - (char *) (&msg->req + 1) + sk->startStreamingAt - - msg->req.beginLsn, - len); + /* if we went beyond available WAL, back off */ + if (endLsn > availableLsn) { + endLsn = availableLsn; } + req = &sk->appendRequest; + PrepareAppendRequest(&sk->appendRequest, sk->streamingAt, endLsn); + ereport(DEBUG2, (errmsg("sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", req->endLsn - req->beginLsn, @@ -1655,19 +1589,28 @@ SendAppendRequests(Safekeeper *sk) LSN_FORMAT_ARGS(req->commitLsn), LSN_FORMAT_ARGS(truncateLsn), sk->host, sk->port))); - /* - * We write with msg->size here because the body of the - * message is stored after the end of the WalMessage - * struct, in the allocation for each msg - */ - writeResult = walprop_async_write(sk->conn, req, sizeof(AppendRequestHeader) + req->endLsn - req->beginLsn); - - /* Free 
up resources */ - if (req != &msg->req) - free(req); + resetStringInfo(&sk->outbuf); + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char*) req, sizeof(AppendRequestHeader)); + + /* write the WAL itself */ + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + if (!WALRead(sk->xlogreader, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + ThisTimeLineID, + &errinfo)) + { + WALReadRaiseError(&errinfo); + } + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); + /* Mark current message as sent, whatever the result is */ - sk->currMsg = sk->currMsg->next; + sk->streamingAt = endLsn; switch (writeResult) { @@ -1723,6 +1666,13 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; + ereport(DEBUG2, + (errmsg("received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port))); + readAnything = true; } @@ -1733,14 +1683,11 @@ RecvAppendResponses(Safekeeper *sk) /* * Also send the new commit lsn to all the safekeepers. - * - * FIXME: This is redundant for safekeepers that have other - * outbound messages pending. */ minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); if (minQuorumLsn > lastSentCommitLsn) { - BroadcastMessage(CreateMessageCommitLsnOnly(lastSentLsn)); + BroadcastAppendRequest(); lastSentCommitLsn = minQuorumLsn; } @@ -2033,25 +1980,16 @@ HandleSafekeeperResponse(void) */ minFlushLsn = CalculateMinFlushLsn(); if (minFlushLsn > truncateLsn) - truncateLsn = minFlushLsn; - - /* - * Cleanup message queue up to truncateLsn. These messages were processed - * by all safekeepers because they all reported flushLsn greater than endLsn. 
- */ - while (msgQueueHead != NULL && msgQueueHead->req.endLsn < truncateLsn) { - WalMessage *msg = msgQueueHead; - msgQueueHead = msg->next; + truncateLsn = minFlushLsn; - memset(msg, 0xDF, sizeof(WalMessage) + msg->size - sizeof(AppendRequestHeader)); - free(msg); + /* + * Advance the replication slot to free up old WAL files. Note + * that slot doesn't exist if we are in syncSafekeepers mode. + */ + if (MyReplicationSlot) + PhysicalConfirmReceivedLocation(truncateLsn); } - if (!msgQueueHead) /* queue is empty */ - msgQueueTail = NULL; - - /* truncateLsn always points to the first chunk in the queue */ - Assert(msgQueueHead == NULL || (truncateLsn >= msgQueueHead->req.beginLsn && truncateLsn <= msgQueueHead->req.endLsn)); /* * Generally sync is done when majority switched the epoch so we committed diff --git a/src/backend/replication/walproposer_utils.c b/src/backend/replication/walproposer_utils.c index 7a593a71778..c9ddafdee0c 100644 --- a/src/backend/replication/walproposer_utils.c +++ b/src/backend/replication/walproposer_utils.c @@ -8,6 +8,15 @@ #include #include +/* + * These variables are used similarly to openLogFile/SegNo, + * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID + * corresponding the filename of walpropFile. + */ +static int walpropFile = -1; +static TimeLineID walpropFileTLI = 0; +static XLogSegNo walpropSegNo = 0; + int CompareLsn(const void *a, const void *b) { @@ -294,3 +303,100 @@ pq_sendint64_le(StringInfo buf, uint64 i) memcpy(buf->data + buf->len, &i, sizeof(uint64)); buf->len += sizeof(uint64); } + +/* + * Write XLOG data to disk. 
+ */ +void +XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr) +{ + int startoff; + int byteswritten; + + while (nbytes > 0) + { + int segbytes; + + /* Close the current segment if it's completed */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + XLogWalPropClose(recptr); + + if (walpropFile < 0) + { + bool use_existent = true; + + /* Create/use new log file */ + XLByteToSeg(recptr, walpropSegNo, wal_segment_size); + walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); + walpropFileTLI = ThisTimeLineID; + } + + /* Calculate the start offset of the received logs */ + startoff = XLogSegmentOffset(recptr, wal_segment_size); + + if (startoff + nbytes > wal_segment_size) + segbytes = wal_segment_size - startoff; + else + segbytes = nbytes; + + /* OK to write the logs */ + errno = 0; + + byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log segment %s " + "at offset %u, length %lu: %m", + xlogfname, startoff, (unsigned long) segbytes))); + } + + /* Update state for write */ + recptr += byteswritten; + + nbytes -= byteswritten; + buf += byteswritten; + } + + /* + * Close the current segment if it's fully written up in the last cycle of + * the loop. + */ + if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) + { + XLogWalPropClose(recptr); + } +} + +/* + * Close the current segment. 
+ */ +void +XLogWalPropClose(XLogRecPtr recptr) +{ + Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); + + if (close(walpropFile) != 0) + { + char xlogfname[MAXFNAMELEN]; + XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); + + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close log segment %s: %m", + xlogfname))); + } + + walpropFile = -1; +} diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index f649302ba9e..e46870e70f8 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1864,7 +1864,7 @@ ProcessStandbyMessage(void) /* * Remember that a walreceiver just confirmed receipt of lsn `lsn`. */ -static void +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn) { bool changed = false; @@ -2030,6 +2030,13 @@ ProcessStandbyReply(XLogRecPtr writePtr, if (!am_cascading_walsender) SyncRepReleaseWaiters(); + /* + * walproposer use trunclateLsn instead of flushPtr for confirmed + * received location, so we shouldn't update restart_lsn here. + */ + if (am_wal_proposer) + return; + /* * Advance our local xmin horizon when the client confirmed a flush. */ @@ -2830,73 +2837,73 @@ XLogSendPhysical(void) nbytes = endptr - startptr; Assert(nbytes <= MAX_SEND_SIZE); - /* - * OK to read and send the slice. - */ - if (output_message.data) - resetStringInfo(&output_message); + if (am_wal_proposer) + { + WalProposerBroadcast(startptr, endptr); + } else - initStringInfo(&output_message); - - pq_sendbyte(&output_message, 'w'); - pq_sendint64(&output_message, startptr); /* dataStart */ - pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ - pq_sendint64(&output_message, 0); /* sendtime, filled in last */ - - /* - * Read the log directly into the output buffer to avoid extra memcpy - * calls. - */ - enlargeStringInfo(&output_message, nbytes); + { + /* + * OK to read and send the slice. 
+ */ + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); -retry: - if (!WALRead(xlogreader, - &output_message.data[output_message.len], - startptr, - nbytes, - xlogreader->seg.ws_tli, /* Pass the current TLI because - * only WalSndSegmentOpen controls - * whether new TLI is needed. */ - &errinfo)) - WALReadRaiseError(&errinfo); + pq_sendbyte(&output_message, 'w'); + pq_sendint64(&output_message, startptr); /* dataStart */ + pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ - /* See logical_read_xlog_page(). */ - XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); - CheckXLogRemoved(segno, xlogreader->seg.ws_tli); + /* + * Read the log directly into the output buffer to avoid extra memcpy + * calls. + */ + enlargeStringInfo(&output_message, nbytes); + + retry: + if (!WALRead(xlogreader, + &output_message.data[output_message.len], + startptr, + nbytes, + xlogreader->seg.ws_tli, /* Pass the current TLI because + * only WalSndSegmentOpen controls + * whether new TLI is needed. */ + &errinfo)) + WALReadRaiseError(&errinfo); + + /* See logical_read_xlog_page(). */ + XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); + CheckXLogRemoved(segno, xlogreader->seg.ws_tli); - /* - * During recovery, the currently-open WAL file might be replaced with the - * file of the same name retrieved from archive. So we always need to - * check what we read was valid after reading into the buffer. If it's - * invalid, we try to open and read the file again. - */ - if (am_cascading_walsender) - { - WalSnd *walsnd = MyWalSnd; - bool reload; + /* + * During recovery, the currently-open WAL file might be replaced with the + * file of the same name retrieved from archive. So we always need to + * check what we read was valid after reading into the buffer. If it's + * invalid, we try to open and read the file again. 
+ */ + if (am_cascading_walsender) + { + WalSnd *walsnd = MyWalSnd; + bool reload; - SpinLockAcquire(&walsnd->mutex); - reload = walsnd->needreload; - walsnd->needreload = false; - SpinLockRelease(&walsnd->mutex); + SpinLockAcquire(&walsnd->mutex); + reload = walsnd->needreload; + walsnd->needreload = false; + SpinLockRelease(&walsnd->mutex); - if (reload && xlogreader->seg.ws_file >= 0) - { - wal_segment_close(xlogreader); + if (reload && xlogreader->seg.ws_file >= 0) + { + wal_segment_close(xlogreader); - goto retry; + goto retry; + } } - } - output_message.len += nbytes; - output_message.data[output_message.len] = '\0'; + output_message.len += nbytes; + output_message.data[output_message.len] = '\0'; - if (am_wal_proposer) - { - WalProposerBroadcast(startptr, output_message.data, output_message.len); - } - else - { /* * Fill the send timestamp last, so that it is taken as late as possible. */ diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 159af4f4bdc..538dcf6c5b6 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -14,6 +14,7 @@ #define SK_PROTOCOL_VERSION 1 #define MAX_SAFEKEEPERS 32 +#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ #define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */ #define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */ @@ -251,23 +252,6 @@ typedef struct AppendRequestHeader pg_uuid_t proposerId; /* for monitoring/debugging */ } AppendRequestHeader; -/* - * All copy data message ('w') are linked in L1 send list and asynchronously sent to receivers. - * When message is sent to all receivers, it is removed from send list. 
- */ -struct WalMessage -{ - WalMessage* next; /* L1 list of messages */ - uint32 size; /* message size */ - AppendRequestHeader req; /* request to safekeeper (message header) */ - - /* PHANTOM FIELD: - * - * All WalMessages are allocated with exactly (size - sizeof(AppendRequestHeader)) additional bytes - * after them, containing the body of the message. This allocation is done in `CreateMessage` - * (for body len > 0) and `CreateMessageVCLOnly` (for body len == 0). */ -}; - /* * Hot standby feedback received from replica */ @@ -342,20 +326,29 @@ typedef struct Safekeeper * reach SS_ACTIVE; not before. */ WalProposerConn* conn; + /* + * Temporary buffer for the message being sent to the safekeeper. + */ StringInfoData outbuf; + /* + * WAL reader, allocated for each safekeeper. + */ + XLogReaderState* xlogreader; - bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ - WalMessage* currMsg; /* message that wasn't sent yet or NULL, if we have nothing to send */ - - int eventPos; /* position in wait event set. Equal to -1 if no event */ - SafekeeperState state; /* safekeeper state machine state */ - AcceptorGreeting greetResponse; /* acceptor greeting */ - VoteResponse voteResponse; /* the vote */ - AppendResponse appendResponse; /* feedback to master */ /* * Streaming will start here; must be record boundary. */ XLogRecPtr startStreamingAt; + + bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */ + XLogRecPtr streamingAt; /* current streaming position */ + AppendRequestHeader appendRequest; /* request for sending to safekeeper */ + + int eventPos; /* position in wait event set. 
Equal to -1 if no event */ + SafekeeperState state; /* safekeeper state machine state */ + AcceptorGreeting greetResponse; /* acceptor greeting */ + VoteResponse voteResponse; /* the vote */ + AppendResponse appendResponse; /* feedback for master */ } Safekeeper; @@ -365,19 +358,22 @@ void AssertEventsOkForState(uint32 events, Safekeeper* sk); uint32 SafekeeperStateDesiredEvents(SafekeeperState state); char* FormatEvents(uint32 events); void WalProposerMain(Datum main_arg); -void WalProposerBroadcast(XLogRecPtr startpos, char* data, int len); +void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); -void pq_sendint32_le(StringInfo buf, uint32 i); -void pq_sendint64_le(StringInfo buf, uint64 i); +void pq_sendint32_le(StringInfo buf, uint32 i); +void pq_sendint64_le(StringInfo buf, uint64 i); void WalProposerPoll(void); void WalProposerRegister(void); +void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr); +void XLogWalPropClose(XLogRecPtr recptr); void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, XLogRecPtr applyPtr, TimestampTz replyTime, bool replyRequested); +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); void ProcessStandbyHSFeedback(TimestampTz replyTime, TransactionId feedbackXmin, uint32 feedbackEpoch, From 841376a0ee5d33ab4779dc886d103a7a345a7f21 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 27 Jan 2022 16:51:17 +0300 Subject: [PATCH 125/167] Add --sysid parameter to initdb --- src/backend/access/transam/xlog.c | 15 +++++++++++---- src/backend/bootstrap/bootstrap.c | 13 ++++++++++++- src/bin/initdb/initdb.c | 4 ++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a475b2ad622..55f14ec032f 100644 --- a/src/backend/access/transam/xlog.c +++ 
b/src/backend/access/transam/xlog.c @@ -111,6 +111,7 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; +uint64 predefined_sysidentifier; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -5311,10 +5312,16 @@ BootStrapXLOG(void) * perhaps be useful sometimes. */ gettimeofday(&tv, NULL); - sysidentifier = ((uint64) tv.tv_sec) << 32; - sysidentifier |= ((uint64) tv.tv_usec) << 12; - sysidentifier |= getpid() & 0xFFF; - + if (predefined_sysidentifier != 0) + { + sysidentifier = predefined_sysidentifier; + } + else + { + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + } /* First timeline ID is always 1 */ ThisTimeLineID = 1; diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 94ab5ca0954..97546f34e9a 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -53,6 +53,7 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +extern uint64 predefined_sysidentifier; static void CheckerModeMain(void); static void BootstrapModeMain(void); @@ -225,7 +226,7 @@ AuxiliaryProcessMain(int argc, char *argv[]) /* If no -x argument, we are a CheckerProcess */ MyAuxProcType = CheckerProcess; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:x:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:s:x:X:-:")) != -1) { switch (flag) { @@ -272,6 +273,16 @@ AuxiliaryProcessMain(int argc, char *argv[]) PGC_S_OVERRIDE); } break; + case 's': + { + char* endptr; +#ifdef HAVE_STRTOULL + predefined_sysidentifier = strtoull(optarg, &endptr, 10); +#else + predefined_sysidentifier = strtoul(optarg, &endptr, 10); +#endif + break; + } case 'c': case '-': { diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 77e621a7679..6e09e22062c 100644 --- a/src/bin/initdb/initdb.c +++ 
b/src/bin/initdb/initdb.c @@ -2948,6 +2948,7 @@ main(int argc, char *argv[]) {"data-checksums", no_argument, NULL, 'k'}, {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, + {"sysid", required_argument, NULL, 15}, {NULL, 0, NULL, 0} }; @@ -3094,6 +3095,9 @@ main(int argc, char *argv[]) extra_options, "-c debug_discard_caches=1"); break; + case 15: + boot_options = psprintf("%s -s %s", boot_options, optarg); + break; default: /* getopt_long already emitted a complaint */ fprintf(stderr, _("Try \"%s --help\" for more information.\n"), From d1be792b7ba7ea84216da2fdacd813cad5e8374e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 27 Jan 2022 17:52:45 +0300 Subject: [PATCH 126/167] Add utilities for recovery database fmro WAL --- contrib/zenith/utils/restore_from_wal.sh | 13 +++++++++++++ contrib/zenith/utils/restore_from_wal_archive.sh | 13 +++++++++++++ 2 files changed, 26 insertions(+) create mode 100755 contrib/zenith/utils/restore_from_wal.sh create mode 100755 contrib/zenith/utils/restore_from_wal_archive.sh diff --git a/contrib/zenith/utils/restore_from_wal.sh b/contrib/zenith/utils/restore_from_wal.sh new file mode 100755 index 00000000000..0c14cbf8216 --- /dev/null +++ b/contrib/zenith/utils/restore_from_wal.sh @@ -0,0 +1,13 @@ +WAL_PATH=$1 +SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` +rm -fr pgsql.0 +env -i /home/knizhnik/zenith.main/tmp_install/bin/initdb -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID +pg_ctl -D pgsql.0 -l logfile start +pg_ctl -D pgsql.0 -l logfile stop -m immediate +cp pgsql.0/pg_wal/000000010000000000000001 . 
+cp $WAL_PATH/* pgsql.0/pg_wal/ +(cd pgsql.0/pg_wal ; for partial in *.partial ; do mv $partial `basename $partial .partial`; done) +dd if=000000010000000000000001 of=pgsql.0/pg_wal/000000010000000000000001 bs=6924704 count=1 conv=notrunc +rm 000000010000000000000001 +rm -f logfile 000000010000000000000001 +pg_ctl -D pgsql.0 -l logfile start diff --git a/contrib/zenith/utils/restore_from_wal_archive.sh b/contrib/zenith/utils/restore_from_wal_archive.sh new file mode 100755 index 00000000000..f84ffdadc48 --- /dev/null +++ b/contrib/zenith/utils/restore_from_wal_archive.sh @@ -0,0 +1,13 @@ +WAL_PATH=$1 +SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` +rm -fr pgsql.0 /tmp/pg_wals +mkdir /tmp/pg_wals +env -i /home/knizhnik/zenith.main/tmp_install/bin/initdb -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID +cp $WAL_PATH/* /tmp/pg_wals +(cd /tmp/pg_wals ; for partial in *.partial ; do mv $partial `basename $partial .partial`; done) +dd if=pgsql.0/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=6924704 count=1 conv=notrunc +echo > pgsql.0/recovery.signal +rm -f pgsql.0/pg_wal/* +echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> pgsql.0/postgresql.conf +rm -f logfile +pg_ctl -D pgsql.0 -l logfile start From 12f4e4e9e689b2cfcaed7c6cf2fb06cfe7a2a08f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 28 Jan 2022 10:30:17 +0300 Subject: [PATCH 127/167] Calculate wal position in restore_from_wal scripts refer #1169 --- contrib/zenith/utils/restore_from_wal.sh | 5 +++-- contrib/zenith/utils/restore_from_wal_archive.sh | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/utils/restore_from_wal.sh b/contrib/zenith/utils/restore_from_wal.sh index 0c14cbf8216..5d82509d28c 100755 --- a/contrib/zenith/utils/restore_from_wal.sh +++ b/contrib/zenith/utils/restore_from_wal.sh @@ -2,12 +2,13 @@ WAL_PATH=$1 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 
3-` rm -fr pgsql.0 env -i /home/knizhnik/zenith.main/tmp_install/bin/initdb -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID +REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` +declare -i WAL_SIZE=$REDO_POS+114 pg_ctl -D pgsql.0 -l logfile start pg_ctl -D pgsql.0 -l logfile stop -m immediate cp pgsql.0/pg_wal/000000010000000000000001 . cp $WAL_PATH/* pgsql.0/pg_wal/ (cd pgsql.0/pg_wal ; for partial in *.partial ; do mv $partial `basename $partial .partial`; done) -dd if=000000010000000000000001 of=pgsql.0/pg_wal/000000010000000000000001 bs=6924704 count=1 conv=notrunc -rm 000000010000000000000001 +dd if=000000010000000000000001 of=pgsql.0/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc rm -f logfile 000000010000000000000001 pg_ctl -D pgsql.0 -l logfile start diff --git a/contrib/zenith/utils/restore_from_wal_archive.sh b/contrib/zenith/utils/restore_from_wal_archive.sh index f84ffdadc48..6e96b334c53 100755 --- a/contrib/zenith/utils/restore_from_wal_archive.sh +++ b/contrib/zenith/utils/restore_from_wal_archive.sh @@ -3,9 +3,11 @@ SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr pgsql.0 /tmp/pg_wals mkdir /tmp/pg_wals env -i /home/knizhnik/zenith.main/tmp_install/bin/initdb -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID +REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` +declare -i WAL_SIZE=$REDO_POS+114 cp $WAL_PATH/* /tmp/pg_wals (cd /tmp/pg_wals ; for partial in *.partial ; do mv $partial `basename $partial .partial`; done) -dd if=pgsql.0/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=6924704 count=1 conv=notrunc +dd if=pgsql.0/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc echo > pgsql.0/recovery.signal rm -f pgsql.0/pg_wal/* echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> pgsql.0/postgresql.conf From 8a364c1acf6a977605841e884381c434df7befa8 Mon Sep 17 00:00:00 
2001 From: Konstantin Knizhnik Date: Mon, 14 Mar 2022 19:20:11 +0300 Subject: [PATCH 128/167] Remove absolute paths from restore_from_wal scripts --- contrib/zenith/utils/restore_from_wal.sh | 3 ++- contrib/zenith/utils/restore_from_wal_archive.sh | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/contrib/zenith/utils/restore_from_wal.sh b/contrib/zenith/utils/restore_from_wal.sh index 5d82509d28c..47a8f91c972 100755 --- a/contrib/zenith/utils/restore_from_wal.sh +++ b/contrib/zenith/utils/restore_from_wal.sh @@ -1,7 +1,8 @@ WAL_PATH=$1 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr pgsql.0 -env -i /home/knizhnik/zenith.main/tmp_install/bin/initdb -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID +INITDB=`type initdb` +env -i $INITDB -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 pg_ctl -D pgsql.0 -l logfile start diff --git a/contrib/zenith/utils/restore_from_wal_archive.sh b/contrib/zenith/utils/restore_from_wal_archive.sh index 6e96b334c53..4822a0a95d1 100755 --- a/contrib/zenith/utils/restore_from_wal_archive.sh +++ b/contrib/zenith/utils/restore_from_wal_archive.sh @@ -2,7 +2,8 @@ WAL_PATH=$1 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr pgsql.0 /tmp/pg_wals mkdir /tmp/pg_wals -env -i /home/knizhnik/zenith.main/tmp_install/bin/initdb -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID +INITDB=`type initdb` +env -i $INITDB -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 cp $WAL_PATH/* /tmp/pg_wals From 5b3852fd1cb711060c40f102fe9c28c6579eb528 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Mar 2022 20:14:58 +0300 Subject: [PATCH 129/167] Remove absolute paths from restore_from_wal scripts ---
contrib/zenith/utils/restore_from_wal.sh | 3 ++- contrib/zenith/utils/restore_from_wal_archive.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/contrib/zenith/utils/restore_from_wal.sh b/contrib/zenith/utils/restore_from_wal.sh index 47a8f91c972..abe64b62d23 100755 --- a/contrib/zenith/utils/restore_from_wal.sh +++ b/contrib/zenith/utils/restore_from_wal.sh @@ -1,7 +1,8 @@ WAL_PATH=$1 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr pgsql.0 -INITDB=`type initdb` +INITDB=`type -p initdb` +echo $INITDB env -i $INITDB -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 diff --git a/contrib/zenith/utils/restore_from_wal_archive.sh b/contrib/zenith/utils/restore_from_wal_archive.sh index 4822a0a95d1..833d2771048 100755 --- a/contrib/zenith/utils/restore_from_wal_archive.sh +++ b/contrib/zenith/utils/restore_from_wal_archive.sh @@ -2,7 +2,7 @@ WAL_PATH=$1 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr pgsql.0 /tmp/pg_wals mkdir /tmp/pg_wals -INITDB=`type initdb` +INITDB=`type -p initdb` env -i $INITDB -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 From 4ffb3bd85e1c7fc381c01af9c2e1749a477fcfce Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Mar 2022 14:11:00 +0300 Subject: [PATCH 130/167] Update scripts for recovery from WAL --- contrib/zenith/utils/restore_from_wal.sh | 31 ++++++++++--------- .../zenith/utils/restore_from_wal_archive.sh | 28 ++++++++++------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/contrib/zenith/utils/restore_from_wal.sh b/contrib/zenith/utils/restore_from_wal.sh index abe64b62d23..af0d49dad54 100755 --- a/contrib/zenith/utils/restore_from_wal.sh +++ b/contrib/zenith/utils/restore_from_wal.sh @@ 
-1,16 +1,19 @@ -WAL_PATH=$1 +PG_BIN=$1 +WAL_PATH=$2 +OUTPUT_DIR=$3 +DATA_DIR=$OUTPUT_DIR/pgdata-vanilla SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr pgsql.0 -INITDB=`type -p initdb` -echo $INITDB -env -i $INITDB -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID -REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` +rm -fr $DATA_DIR +env -i $PG_BIN/initdb -E utf8 -D $DATA_DIR --sysid=$SYSID +REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 -pg_ctl -D pgsql.0 -l logfile start -pg_ctl -D pgsql.0 -l logfile stop -m immediate -cp pgsql.0/pg_wal/000000010000000000000001 . -cp $WAL_PATH/* pgsql.0/pg_wal/ -(cd pgsql.0/pg_wal ; for partial in *.partial ; do mv $partial `basename $partial .partial`; done) -dd if=000000010000000000000001 of=pgsql.0/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -rm -f logfile 000000010000000000000001 -pg_ctl -D pgsql.0 -l logfile start +$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start +$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate +cp $DATA_DIR/pg_wal/000000010000000000000001 . 
+cp $WAL_PATH/* $DATA_DIR/pg_wal/ +if [ -f $DATA_DIR/pg_wal/*.partial ] +then + (cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) +fi +dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +rm -f 000000010000000000000001 diff --git a/contrib/zenith/utils/restore_from_wal_archive.sh b/contrib/zenith/utils/restore_from_wal_archive.sh index 833d2771048..fd0a3ed4c02 100755 --- a/contrib/zenith/utils/restore_from_wal_archive.sh +++ b/contrib/zenith/utils/restore_from_wal_archive.sh @@ -1,16 +1,20 @@ -WAL_PATH=$1 +PG_BIN=$1 +WAL_PATH=$2 +OUTPUT_DIR=$3 +DATA_DIR=$OUTPUT_DIR/pgdata-vanilla +echo WAL_PATh=$WAL_PATH OUTPUT_DIR=$OUTPUT_DIR DATA_DIR=$DATA_DIR SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr pgsql.0 /tmp/pg_wals +rm -fr $DATA_DIR /tmp/pg_wals mkdir /tmp/pg_wals -INITDB=`type -p initdb` -env -i $INITDB -E utf8 -U zenith_admin -D pgsql.0 --sysid=$SYSID -REDO_POS=0x`pg_controldata -D pgsql.0 | fgrep "REDO location"| cut -c 42-` +env -i $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 cp $WAL_PATH/* /tmp/pg_wals -(cd /tmp/pg_wals ; for partial in *.partial ; do mv $partial `basename $partial .partial`; done) -dd if=pgsql.0/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -echo > pgsql.0/recovery.signal -rm -f pgsql.0/pg_wal/* -echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> pgsql.0/postgresql.conf -rm -f logfile -pg_ctl -D pgsql.0 -l logfile start +if [ -f $DATA_DIR/pg_wal/*.partial ] +then + (cd /tmp/pg_wals ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) +fi +dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc +echo > 
$DATA_DIR/recovery.signal +rm -f $DATA_DIR/pg_wal/* +echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf From cc12d19a79e7814be3128b2e321fa98eb113e73a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Mar 2022 18:05:54 +0300 Subject: [PATCH 131/167] Make it possible to specify port and datadir for restore_from_wal --- contrib/zenith/utils/restore_from_wal.sh | 5 +++-- contrib/zenith/utils/restore_from_wal_archive.sh | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/contrib/zenith/utils/restore_from_wal.sh b/contrib/zenith/utils/restore_from_wal.sh index af0d49dad54..21eb2656d95 100755 --- a/contrib/zenith/utils/restore_from_wal.sh +++ b/contrib/zenith/utils/restore_from_wal.sh @@ -1,10 +1,11 @@ PG_BIN=$1 WAL_PATH=$2 -OUTPUT_DIR=$3 -DATA_DIR=$OUTPUT_DIR/pgdata-vanilla +DATA_DIR=$3 +PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR env -i $PG_BIN/initdb -E utf8 -D $DATA_DIR --sysid=$SYSID +echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` declare -i WAL_SIZE=$REDO_POS+114 $PG_BIN/pg_ctl -D $DATA_DIR -l logfile start diff --git a/contrib/zenith/utils/restore_from_wal_archive.sh b/contrib/zenith/utils/restore_from_wal_archive.sh index fd0a3ed4c02..0d9dbb5a8ce 100755 --- a/contrib/zenith/utils/restore_from_wal_archive.sh +++ b/contrib/zenith/utils/restore_from_wal_archive.sh @@ -1,12 +1,12 @@ PG_BIN=$1 WAL_PATH=$2 -OUTPUT_DIR=$3 -DATA_DIR=$OUTPUT_DIR/pgdata-vanilla -echo WAL_PATh=$WAL_PATH OUTPUT_DIR=$OUTPUT_DIR DATA_DIR=$DATA_DIR +DATA_DIR=$3 +PORT=$4 SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` rm -fr $DATA_DIR /tmp/pg_wals mkdir /tmp/pg_wals env -i $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID +echo port=$PORT >> $DATA_DIR/postgresql.conf REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut 
-c 42-` declare -i WAL_SIZE=$REDO_POS+114 cp $WAL_PATH/* /tmp/pg_wals From 61afbf978b17764134ab6f1650bbdcadac147e71 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 16 Mar 2022 10:14:06 +0300 Subject: [PATCH 132/167] Move recovery scripts to main repo --- contrib/zenith/utils/restore_from_wal.sh | 20 ------------------- .../zenith/utils/restore_from_wal_archive.sh | 20 ------------------- 2 files changed, 40 deletions(-) delete mode 100755 contrib/zenith/utils/restore_from_wal.sh delete mode 100755 contrib/zenith/utils/restore_from_wal_archive.sh diff --git a/contrib/zenith/utils/restore_from_wal.sh b/contrib/zenith/utils/restore_from_wal.sh deleted file mode 100755 index 21eb2656d95..00000000000 --- a/contrib/zenith/utils/restore_from_wal.sh +++ /dev/null @@ -1,20 +0,0 @@ -PG_BIN=$1 -WAL_PATH=$2 -DATA_DIR=$3 -PORT=$4 -SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr $DATA_DIR -env -i $PG_BIN/initdb -E utf8 -D $DATA_DIR --sysid=$SYSID -echo port=$PORT >> $DATA_DIR/postgresql.conf -REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` -declare -i WAL_SIZE=$REDO_POS+114 -$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start -$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate -cp $DATA_DIR/pg_wal/000000010000000000000001 . 
-cp $WAL_PATH/* $DATA_DIR/pg_wal/ -if [ -f $DATA_DIR/pg_wal/*.partial ] -then - (cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) -fi -dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -rm -f 000000010000000000000001 diff --git a/contrib/zenith/utils/restore_from_wal_archive.sh b/contrib/zenith/utils/restore_from_wal_archive.sh deleted file mode 100755 index 0d9dbb5a8ce..00000000000 --- a/contrib/zenith/utils/restore_from_wal_archive.sh +++ /dev/null @@ -1,20 +0,0 @@ -PG_BIN=$1 -WAL_PATH=$2 -DATA_DIR=$3 -PORT=$4 -SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-` -rm -fr $DATA_DIR /tmp/pg_wals -mkdir /tmp/pg_wals -env -i $PG_BIN/initdb -E utf8 -U zenith_admin -D $DATA_DIR --sysid=$SYSID -echo port=$PORT >> $DATA_DIR/postgresql.conf -REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-` -declare -i WAL_SIZE=$REDO_POS+114 -cp $WAL_PATH/* /tmp/pg_wals -if [ -f $DATA_DIR/pg_wal/*.partial ] -then - (cd /tmp/pg_wals ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done) -fi -dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc -echo > $DATA_DIR/recovery.signal -rm -f $DATA_DIR/pg_wal/* -echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf From d7c8426e49cff3c791c3f2c4cde95f1fce665573 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 15 Apr 2022 18:11:05 +0400 Subject: [PATCH 133/167] Give up connection attempt to safekeeper after timeout. Enforces reconnection soon when packets are dropped, e.g. after turning ec2 instance off. 
ref https://github.com/neondatabase/neon/issues/1491 --- src/backend/replication/walproposer.c | 27 +++++++++++++++++++++++++-- src/backend/utils/misc/guc.c | 11 +++++++++++ src/include/replication/walproposer.h | 2 ++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 4843b10e1d9..d9d44201242 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -61,6 +61,7 @@ char *wal_acceptors_list; int wal_acceptor_reconnect_timeout; +int wal_acceptor_connect_timeout; bool am_wal_proposer; char *zenith_timeline_walproposer = NULL; @@ -313,6 +314,8 @@ WalProposerPoll(void) } if (rc == 0) /* timeout expired: poll state */ { + TimestampTz now; + /* * If no WAL was generated during timeout (and we have already * collected the quorum), then send pool message @@ -321,6 +324,25 @@ WalProposerPoll(void) { BroadcastAppendRequest(); } + + /* + * Abandon connection attempts which take too long. 
+ */ + now = GetCurrentTimestamp(); + for (int i = 0; i < n_safekeepers; i++) + { + Safekeeper *sk = &safekeeper[i]; + + if ((sk->state == SS_CONNECTING_WRITE || + sk->state == SS_CONNECTING_READ) && + TimestampDifferenceExceeds(sk->startedConnAt, now, + wal_acceptor_connect_timeout)) + { + elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms", + sk->host, sk->port, wal_acceptor_connect_timeout); + ShutdownConnection(sk); + } + } } } } @@ -622,9 +644,10 @@ ResetConnection(Safekeeper *sk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - elog(LOG, "Connecting with node %s:%s", sk->host, sk->port); + elog(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; + sk->startedConnAt = GetCurrentTimestamp(); sock = walprop_socket(sk->conn); sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk); @@ -803,7 +826,7 @@ HandleConnectionEvent(Safekeeper *sk) break; case WP_CONN_POLLING_FAILED: - elog(WARNING, "Failed to connect to node '%s:%s': %s", + elog(WARNING, "failed to connect to node '%s:%s': %s", sk->host, sk->port, walprop_error_message(sk->conn)); /* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 95e2dbf873a..bbc1ec8e181 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2308,6 +2308,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"wal_acceptor_connect_timeout", PGC_SIGHUP, REPLICATION_STANDBY, + gettext_noop("Timeout after which give up connection attempt to safekeeper."), + NULL, + GUC_UNIT_MS + }, + &wal_acceptor_connect_timeout, + 5000, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"max_connections", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the maximum number of concurrent connections."), diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 538dcf6c5b6..19361eeaffc 100644 --- 
a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -27,6 +27,7 @@ extern char* wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; +extern int wal_acceptor_connect_timeout; extern bool am_wal_proposer; struct WalProposerConn; /* Defined in libpqwalproposer */ @@ -346,6 +347,7 @@ typedef struct Safekeeper int eventPos; /* position in wait event set. Equal to -1 if no event */ SafekeeperState state; /* safekeeper state machine state */ + TimestampTz startedConnAt; /* when connection attempt started */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ AppendResponse appendResponse; /* feedback for master */ From 17213ed8bea836a196d173509c6285cf47f0c809 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 23 Apr 2022 08:52:15 +0300 Subject: [PATCH 134/167] Avoid redundand memory allocation and sycnhronization in walredo (#144) * Avoid redundand memory allocation and sycnhronization in walredo * Address review comments * Reduce number of temp buffers and size of inmem file storage for wal redo postgres * Misc cleanup Add comments on 'inmem_smgr.c', remove superfluous copy-pasted comments, pgindent. Co-authored-by: Heikki Linnakangas --- contrib/zenith/inmem_smgr.c | 188 +++++++++++----------------- src/backend/storage/buffer/bufmgr.c | 7 +- src/backend/tcop/zenith_wal_redo.c | 78 +++++------- src/include/miscadmin.h | 3 + 4 files changed, 112 insertions(+), 164 deletions(-) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index 6ad1e65b04a..bdd58731f3c 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -2,36 +2,52 @@ * * inmem_smgr.c * + * This is an implementation of the SMGR interface, used in the WAL redo + * process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent + * storage, the pages that are written out are kept in a small number of + * in-memory buffers. 
+ * + * Normally, replaying a WAL record only needs to access a handful of + * buffers, which fit in the normal buffer cache, so this is just for + * "overflow" storage when the buffer cache is not large enough. + * + * * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION * contrib/zenith/inmem_smgr.c - * - * TODO cleanup obsolete copy-pasted comments *------------------------------------------------------------------------- */ #include "postgres.h" + +#include "pagestore_client.h" #include "storage/block.h" +#include "storage/buf_internals.h" #include "storage/relfilenode.h" -#include "pagestore_client.h" -#include "utils/hsearch.h" -#include "access/xlog.h" +#include "storage/smgr.h" -typedef struct -{ - RelFileNode node; - ForkNumber forknum; - BlockNumber blkno; -} WrNodeKey; +#define MAX_PAGES 128 -typedef struct -{ - WrNodeKey tag; - char data[BLCKSZ]; -} WrNode; +static BufferTag page_tag[MAX_PAGES]; +static char page_body[MAX_PAGES][BLCKSZ]; +static int used_pages; -HTAB *inmem_files; +static int +locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno) +{ + /* We only hold a small number of pages, so linear search */ + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum + && blkno == page_tag[i].blockNum) + { + return i; + } + } + return -1; +} /* * inmem_init() -- Initialize private state @@ -39,18 +55,7 @@ HTAB *inmem_files; void inmem_init(void) { - HASHCTL hashCtl; - - hashCtl.keysize = sizeof(WrNodeKey); - hashCtl.entrysize = sizeof(WrNode); - - if (inmem_files) - hash_destroy(inmem_files); - - inmem_files = hash_create("wal-redo files map", - 1024, - &hashCtl, - HASH_ELEM | HASH_BLOBS); + used_pages = 0; } /* @@ -59,15 +64,15 @@ inmem_init(void) bool inmem_exists(SMgrRelation reln, ForkNumber forknum) { - WrNodeKey key; - - key.node = 
reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = 0; - return hash_search(inmem_files, - &key, - HASH_FIND, - NULL) != NULL; + for (int i = 0; i < used_pages; i++) + { + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + return true; + } + } + return false; } /* @@ -82,21 +87,6 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) /* * inmem_unlink() -- Unlink a relation. - * - * Note that we're passed a RelFileNodeBackend --- by the time this is called, - * there won't be an SMgrRelation hashtable entry anymore. - * - * forknum can be a fork number to delete a specific fork, or InvalidForkNumber - * to delete all forks. - * - * - * If isRedo is true, it's unsurprising for the relation to be already gone. - * Also, we should remove the file immediately instead of queuing a request - * for later, since during redo there's no possibility of creating a - * conflicting relation. - * - * Note: any failure should be reported as WARNING not ERROR, because - * we are usually not in a transaction anymore when this is called. */ void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) @@ -116,17 +106,8 @@ void inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *buffer, bool skipFsync) { - WrNodeKey key; - WrNode *node; - - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = blkno; - node = hash_search(inmem_files, - &key, - HASH_ENTER, - NULL); - memcpy(node->data, buffer, BLCKSZ); + /* same as smgwrite() for us */ + inmem_write(reln, forknum, blkno, buffer, skipFsync); } /* @@ -156,9 +137,6 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) /* * inmem_writeback() -- Tell the kernel to write pages back to storage. - * - * This accepts a range of blocks because flushing several pages at once is - * considerably more efficient than doing so individually. 
*/ void inmem_writeback(SMgrRelation reln, ForkNumber forknum, @@ -173,20 +151,13 @@ void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *buffer) { - WrNodeKey key; - WrNode *node; + int pg; - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = blkno; - node = hash_search(inmem_files, - &key, - HASH_FIND, - NULL); - if (node != NULL) - memcpy(buffer, node->data, BLCKSZ); - else + pg = locate_page(reln, forknum, blkno); + if (pg < 0) memset(buffer, 0, BLCKSZ); + else + memcpy(buffer, page_body[pg], BLCKSZ); } /* @@ -200,17 +171,19 @@ void inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - WrNodeKey key; - WrNode *node; + int pg; + + pg = locate_page(reln, forknum, blocknum); + if (pg < 0) + { + if (used_pages == MAX_PAGES) + elog(ERROR, "Inmem storage overflow"); - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = blocknum; - node = hash_search(inmem_files, - &key, - HASH_ENTER, - NULL); - memcpy(node->data, buffer, BLCKSZ); + pg = used_pages; + used_pages++; + INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } + memcpy(page_body[pg], buffer, BLCKSZ); } /* @@ -219,23 +192,18 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { - WrNodeKey key; - WrNode *node; - - key.node = reln->smgr_rnode.node; - key.forknum = forknum; - key.blkno = 0; + int nblocks = 0; - while (true) + for (int i = 0; i < used_pages; i++) { - node = hash_search(inmem_files, - &key, - HASH_FIND, - NULL); - if (node == NULL) - return key.blkno; - key.blkno += 1; + if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) + && forknum == page_tag[i].forkNum) + { + if (page_tag[i].blockNum >= nblocks) + nblocks = page_tag[i].blockNum + 1; + } } + return nblocks; } /* @@ -248,19 +216,12 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber 
nblocks) /* * inmem_immedsync() -- Immediately sync a relation to stable storage. - * - * Note that only writes already issued are synced; this routine knows - * nothing of dirty buffers that may exist inside the buffer manager. We - * sync active and inactive segments; smgrDoPendingSyncs() relies on this. - * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of - * some segment, then mdtruncate() renders that segment inactive. If we - * crash before the next checkpoint syncs the newly-inactive segment, that - * segment may survive recovery, reintroducing unwanted data into the table. */ void inmem_immedsync(SMgrRelation reln, ForkNumber forknum) { } + static const struct f_smgr inmem_smgr = { .smgr_init = inmem_init, @@ -283,12 +244,11 @@ static const struct f_smgr inmem_smgr = const f_smgr * smgr_inmem(BackendId backend, RelFileNode rnode) { - if (backend != InvalidBackendId && !InRecovery) + Assert(InRecovery); + if (backend != InvalidBackendId) return smgr_standard(backend, rnode); else - { return &inmem_smgr; - } } void diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a90f6432701..cd304c4136e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -800,7 +800,6 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, mode, strategy, &hit); } - /* * ReadBuffer_common -- common logic for all ReadBuffer variants * @@ -815,7 +814,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Block bufBlock; bool found; bool isExtend; - bool isLocalBuf = SmgrIsTemp(smgr); + /* + * wal_redo postgres is working in single user mode, we do not need to synchronize access to shared buffer, + * so let's use local buffers instead + */ + bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres; *hit = false; diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 0ddd2ddec24..16298ea7f4f 100644 --- 
a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -99,6 +99,10 @@ static ssize_t buffered_read(void *buf, size_t count); static BufferTag target_redo_tag; +bool am_wal_redo_postgres; + +static XLogReaderState *reader_state; + #define TRACE DEBUG5 #ifdef HAVE_LIBSECCOMP @@ -166,12 +170,20 @@ WalRedoMain(int argc, char *argv[], InitStandaloneProcess(argv[0]); SetProcessingMode(InitProcessing); + am_wal_redo_postgres = true; /* * Set default values for command-line options. */ InitializeGUCOptions(); + /* + * WAL redo does not need a large number of buffers. And speed of + * DropRelFileNodeAllLocalBuffers() is proportional to the number of + * buffers. So let's keep it small (default value is 1024) + */ + num_temp_buffers = 4; + /* * Parse command-line options. * TODO @@ -293,6 +305,7 @@ WalRedoMain(int argc, char *argv[], if (RmgrTable[rmid].rm_startup != NULL) RmgrTable[rmid].rm_startup(); } + reader_state = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(), NULL); #ifdef HAVE_LIBSECCOMP /* We prefer opt-out to opt-in for greater security */ @@ -313,16 +326,13 @@ WalRedoMain(int argc, char *argv[], /* * Main processing loop */ + MemoryContextSwitchTo(MessageContext); + initStringInfo(&input_message); + for (;;) { - /* - * Release storage left over from prior query cycle, and create a new - * query input buffer in the cleared MessageContext. - */ - MemoryContextSwitchTo(MessageContext); - MemoryContextResetAndDeleteChildren(MessageContext); - - initStringInfo(&input_message); + /* Release memory left over from prior query cycle. 
*/ + resetStringInfo(&input_message); set_ps_display("idle"); @@ -330,7 +340,6 @@ WalRedoMain(int argc, char *argv[], * (3) read a command (loop blocks here) */ firstchar = ReadRedoCommand(&input_message); - switch (firstchar) { case 'B': /* BeginRedoForBlock */ @@ -406,23 +415,6 @@ pprint_buffer(char *data, int len) return s.data; } -static char * -pprint_tag(BufferTag *tag) -{ - StringInfoData s; - - initStringInfo(&s); - - appendStringInfo(&s, "%u/%u/%u.%d blk %u", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum - ); - - return s.data; -} /* ---------------------------------------------------------------- * routines to obtain user input * ---------------------------------------------------------------- @@ -492,7 +484,6 @@ ReadRedoCommand(StringInfo inBuf) return qtype; } - /* * Prepare for WAL replay on given block */ @@ -502,7 +493,6 @@ BeginRedoForBlock(StringInfo input_message) RelFileNode rnode; ForkNumber forknum; BlockNumber blknum; - MemoryContext oldcxt; SMgrRelation reln; /* @@ -520,16 +510,14 @@ BeginRedoForBlock(StringInfo input_message) rnode.relNode = pq_getmsgint(input_message, 4); blknum = pq_getmsgint(input_message, 4); - oldcxt = MemoryContextSwitchTo(TopMemoryContext); INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum); - { - char* buf = pprint_tag(&target_redo_tag); - elog(TRACE, "BeginRedoForBlock %s", buf); - pfree(buf); - } - - MemoryContextSwitchTo(oldcxt); + elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u", + target_redo_tag.rnode.spcNode, + target_redo_tag.rnode.dbNode, + target_redo_tag.rnode.relNode, + target_redo_tag.forkNum, + target_redo_tag.blockNum); reln = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber || @@ -589,7 +577,6 @@ ApplyRecord(StringInfo input_message) XLogRecPtr lsn; XLogRecord *record; int nleft; - XLogReaderState reader_state; /* * message format: @@ -607,20 +594,15 @@ 
ApplyRecord(StringInfo input_message) elog(ERROR, "mismatch between record (%d) and message size (%d)", record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); - /* FIXME: use XLogReaderAllocate() */ - memset(&reader_state, 0, sizeof(XLogReaderState)); - reader_state.ReadRecPtr = 0; /* no 'prev' record */ - reader_state.EndRecPtr = lsn; /* this record */ - reader_state.decoded_record = record; - reader_state.errormsg_buf = palloc(1000 + 1); /* MAX_ERRORMSG_LEN */ - - if (!DecodeXLogRecord(&reader_state, record, &errormsg)) + XLogBeginRead(reader_state, lsn); + reader_state->decoded_record = record; + if (!DecodeXLogRecord(reader_state, record, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); /* Ignore any other blocks than the ones the caller is interested in */ redo_read_buffer_filter = redo_block_filter; - RmgrTable[record->xl_rmid].rm_redo(&reader_state); + RmgrTable[record->xl_rmid].rm_redo(reader_state); redo_read_buffer_filter = NULL; @@ -701,8 +683,8 @@ GetPage(StringInfo input_message) } while (tot_written < BLCKSZ); ReleaseBuffer(buf); - DropDatabaseBuffers(rnode.dbNode); - smgrinit(); //reset inmem smgr state + DropRelFileNodeAllLocalBuffers(rnode); + smgrinit(); /* reset inmem smgr state */ elog(TRACE, "Page sent back for block %u", blknum); } diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 3f155ce4f84..72bd0a7ebd4 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -489,4 +489,7 @@ extern void CancelBackup(void); extern size_t get_hash_memory_limit(void); extern int get_hash_mem(void); +/* in src/backend/tcop/zenith_wal_redo.c */ +extern bool am_wal_redo_postgres; + #endif /* MISCADMIN_H */ From 3256ff321b85876026d097bc8c1df276bbf0bdff Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 25 Apr 2022 19:54:53 +0300 Subject: [PATCH 135/167] Fix missed include for InRecovery (#149) * Fix missed include for InRecovery * Fix missed include for InRecovery (used only in debug version with 
--enable-cassert) --- contrib/zenith/inmem_smgr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index bdd58731f3c..95e7d062f61 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -21,6 +21,7 @@ */ #include "postgres.h" +#include "access/xlog.h" #include "pagestore_client.h" #include "storage/block.h" #include "storage/buf_internals.h" From 223652c4eecb62daf45cfbaef347793ecfb27521 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 09:44:00 +0300 Subject: [PATCH 136/167] Avoid "bad syscall 39" on assertion failure in WAL redo process. ExceptionalCondition calls getpid(), which is currently forbidden by seccomp. You only get there if something else went wrong, but the "bad syscall" error hides the underlying cause of the error, which makes debugging hard. --- src/backend/tcop/zenith_wal_redo.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 16298ea7f4f..f09ae5273b2 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -125,6 +125,13 @@ enter_seccomp_mode(void) PG_SCMP_ALLOW(mmap), PG_SCMP_ALLOW(munmap), #endif + /* + * getpid() is called on assertion failure, in ExceptionalCondition. + * It's not really needed, but seems pointless to hide it either. The + * system call is unlikely to expose a kernel vulnerability, and the PID + * is stored in MyProcPid anyway. + */ + PG_SCMP_ALLOW(getpid), /* Enable those for a proper shutdown. PG_SCMP_ALLOW(munmap), From 3d143d49106e8a6634c1b04adb456f0526a01e3b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 12:17:45 +0300 Subject: [PATCH 137/167] Add error context, if replaying a WAL record fails in WAL redo process. 
--- src/backend/access/transam/xlog.c | 3 +-- src/backend/tcop/zenith_wal_redo.c | 37 +++++++++++++++++++++++++++++- src/include/access/xlog.h | 1 + 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 55f14ec032f..f10dcd95bdc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -976,7 +976,6 @@ static bool CheckForStandbyTrigger(void); static void xlog_outrec(StringInfo buf, XLogReaderState *record); #endif static void xlog_block_info(StringInfo buf, XLogReaderState *record); -static void xlog_outdesc(StringInfo buf, XLogReaderState *record); static void pg_start_backup_callback(int code, Datum arg); static void pg_stop_backup_callback(int code, Datum arg); static bool read_backup_label(XLogRecPtr *checkPointLoc, @@ -10882,7 +10881,7 @@ xlog_block_info(StringInfo buf, XLogReaderState *record) * Returns a string describing an XLogRecord, consisting of its identity * optionally followed by a colon, a space, and a further description. 
*/ -static void +void xlog_outdesc(StringInfo buf, XLogReaderState *record) { RmgrId rmid = XLogRecGetRmid(record); diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index f09ae5273b2..ac55c7cefc6 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -93,6 +93,7 @@ static int ReadRedoCommand(StringInfo inBuf); static void BeginRedoForBlock(StringInfo input_message); static void PushPage(StringInfo input_message); static void ApplyRecord(StringInfo input_message); +static void apply_error_callback(void *arg); static bool redo_block_filter(XLogReaderState *record, uint8 block_id); static void GetPage(StringInfo input_message); static ssize_t buffered_read(void *buf, size_t count); @@ -579,11 +580,11 @@ PushPage(StringInfo input_message) static void ApplyRecord(StringInfo input_message) { - /* recovery here */ char *errormsg; XLogRecPtr lsn; XLogRecord *record; int nleft; + ErrorContextCallback errcallback; /* * message format: @@ -601,7 +602,18 @@ ApplyRecord(StringInfo input_message) elog(ERROR, "mismatch between record (%d) and message size (%d)", record->xl_tot_len, (int) sizeof(XLogRecord) + nleft); + /* Setup error traceback support for ereport() */ + errcallback.callback = apply_error_callback; + errcallback.arg = (void *) reader_state; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + XLogBeginRead(reader_state, lsn); + /* + * In lieu of calling XLogReadRecord, store the record 'decoded_record' + * buffer directly. 
+ */ + reader_state->ReadRecPtr = lsn; reader_state->decoded_record = record; if (!DecodeXLogRecord(reader_state, record, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); @@ -613,10 +625,33 @@ ApplyRecord(StringInfo input_message) redo_read_buffer_filter = NULL; + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + elog(TRACE, "applied WAL record with LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn); } +/* + * Error context callback for errors occurring during ApplyRecord + */ +static void +apply_error_callback(void *arg) +{ + XLogReaderState *record = (XLogReaderState *) arg; + StringInfoData buf; + + initStringInfo(&buf); + xlog_outdesc(&buf, record); + + /* translator: %s is a WAL record description */ + errcontext("WAL redo at %X/%X for %s", + LSN_FORMAT_ARGS(record->ReadRecPtr), + buf.data); + + pfree(buf.data); +} + static bool redo_block_filter(XLogReaderState *record, uint8 block_id) { diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index e34f1deaf6e..f35e3686cf8 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -307,6 +307,7 @@ extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); extern const char *xlog_identify(uint8 info); +extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); extern void issue_xlog_fsync(int fd, XLogSegNo segno); From 352e286ec76d16169ba425ebd56f9185fa9b76a7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 13:19:01 +0300 Subject: [PATCH 138/167] Add WARNING for debugging purposes. 
--- contrib/zenith/inmem_smgr.c | 4 ++++ src/backend/tcop/zenith_wal_redo.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index 95e7d062f61..dbc780624a8 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -195,6 +195,10 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { int nblocks = 0; + /* + * Find the hightest-numbered page, and report that as the relation size. + * XXX: Why does this get called during WAL replay at all? + */ for (int i = 0; i < used_pages; i++) { if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index ac55c7cefc6..aa423a0e2dc 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -664,6 +664,14 @@ redo_block_filter(XLogReaderState *record, uint8 block_id) elog(PANIC, "failed to locate backup block with ID %d", block_id); } + /* + * Can a WAL redo function ever access a relation other than the one that + * it modifies? I don't see why it would. + */ + if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode)) + elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u", + target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum); + /* * If this block isn't one we are currently restoring, then return 'true' * so that this gets ignored From d2afbf7f1ca407a5dbb8599336c1b5d7254dafa9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 14:20:49 +0300 Subject: [PATCH 139/167] Turn Assertion into elog(ERROR), to help with debugging. This error is happening in the 'pg_regress' test in the CI, but not on my laptop. Turn it into an ERROR, so that we get the error context and backtrace of it. 
--- src/backend/storage/smgr/smgr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 10a6f65c118..f1e676bcc3e 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -194,7 +194,11 @@ smgropen(RelFileNode rnode, BackendId backend, char relpersistence) if (reln->smgr_relpersistence == 0) reln->smgr_relpersistence = relpersistence; else - Assert(relpersistence == 0 || reln->smgr_relpersistence == relpersistence); + { + if (!(relpersistence == 0 || reln->smgr_relpersistence == relpersistence)) + elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", + relpersistence, reln->smgr_relpersistence); + } } return reln; From 20c37c0565b86e07f2d22576767770cb1d159a2c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 26 Apr 2022 15:05:50 +0300 Subject: [PATCH 140/167] Fix errors in WAL redo about relpersistence mismatch. In the WAL redo process, even "permanent" buffers are stored in the local buffer cache. Need to pass RELPERSISTENCE_PERMANENT to smgropen() in that case. 
--- src/backend/storage/buffer/localbuf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index b9811cc7327..3184b1e5686 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -18,6 +18,7 @@ #include "access/parallel.h" #include "catalog/catalog.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/guc.h" @@ -215,7 +216,10 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); + if (am_wal_redo_postgres && MyBackendId == InvalidBackendId) + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_PERMANENT); + else + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); From fcd2af7585436e60d4d5f60348790f6f8c944640 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 26 Apr 2022 18:51:16 +0400 Subject: [PATCH 141/167] Don't log 'last written LSN ahead of flushed'. That's a valid case, as the edited comment says. https://github.com/neondatabase/neon/issues/1303 --- contrib/zenith/pagestore_smgr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index 18c55fa5cdc..caa77a59091 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -592,14 +592,16 @@ zenith_get_request_lsn(bool *latest) /* * Is it possible that the last-written LSN is ahead of last flush - * LSN? Probably not, we shouldn't evict a page from the buffer cache + * LSN? Generally not, we shouldn't evict a page from the buffer cache * before all its modifications have been safely flushed. 
That's the - * "WAL before data" rule. But better safe than sorry. + * "WAL before data" rule. However, such case does exist at index building, + * _bt_blwritepage logs the full page without flushing WAL before + * smgrextend (files are fsynced before build ends). */ flushlsn = GetFlushRecPtr(); if (lsn > flushlsn) { - elog(LOG, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn, (uint32) (flushlsn >> 32), (uint32) flushlsn); XLogFlush(lsn); From a13fe64a3eff1743ff17141a2e6057f5103829f0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 27 Apr 2022 17:36:20 +0300 Subject: [PATCH 142/167] Perform inmem_smgr cleaup after processing each record (#154) * Perform inmem_smgr cleaup after processing each record * Prevent eviction of wal redo target page * Prevent eviction of wal redo target page frmo temp buffers --- contrib/zenith/inmem_smgr.c | 6 +----- src/backend/storage/buffer/localbuf.c | 8 ++++++++ src/backend/tcop/zenith_wal_redo.c | 5 ++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index dbc780624a8..abc600f0b4a 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -28,7 +28,7 @@ #include "storage/relfilenode.h" #include "storage/smgr.h" -#define MAX_PAGES 128 +#define MAX_PAGES 32 static BufferTag page_tag[MAX_PAGES]; static char page_body[MAX_PAGES][BLCKSZ]; @@ -195,10 +195,6 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { int nblocks = 0; - /* - * Find the hightest-numbered page, and report that as the relation size. - * XXX: Why does this get called during WAL replay at all? 
- */ for (int i = 0; i < used_pages; i++) { if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3184b1e5686..f22ec0d82df 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -25,6 +25,8 @@ #include "utils/memutils.h" #include "utils/resowner_private.h" +/* ZENITH: prevent eviction of the buffer of target page */ +extern Buffer wal_redo_buffer; /*#define LBDEBUG*/ @@ -183,6 +185,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, if (LocalRefCount[b] == 0) { + if (-b - 1 == wal_redo_buffer) + { + /* ZENITH: Prevent eviction of the buffer with target wal redo page */ + continue; + } + buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index aa423a0e2dc..68f29564328 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -100,6 +100,7 @@ static ssize_t buffered_read(void *buf, size_t count); static BufferTag target_redo_tag; +Buffer wal_redo_buffer; bool am_wal_redo_postgres; static XLogReaderState *reader_state; @@ -566,6 +567,7 @@ PushPage(StringInfo input_message) content = pq_getmsgbytes(input_message, BLCKSZ); buf = ReadBufferWithoutRelcache(rnode, forknum, blknum, RBM_ZERO_AND_LOCK, NULL); + wal_redo_buffer = buf; page = BufferGetPage(buf); memcpy(page, content, BLCKSZ); MarkBufferDirty(buf); /* pro forma */ @@ -594,6 +596,8 @@ ApplyRecord(StringInfo input_message) */ lsn = pq_getmsgint64(input_message); + smgrinit(); /* reset inmem smgr state */ + /* note: the input must be aligned here */ record = (XLogRecord *) pq_getmsgbytes(input_message, sizeof(XLogRecord)); @@ -734,7 +738,6 @@ GetPage(StringInfo input_message) ReleaseBuffer(buf); DropRelFileNodeAllLocalBuffers(rnode); - smgrinit(); /* reset inmem smgr state */ 
elog(TRACE, "Page sent back for block %u", blknum); } From 868e7be7ff7dd1d026917892b3951f812e9d4a08 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 3 May 2022 13:03:19 +0300 Subject: [PATCH 143/167] Avoid extending relation in the WAL redo process. It's a waste of time, and otherwise you can run into the MAX_PAGES limit. Fixes https://github.com/neondatabase/neon/issues/1615 --- contrib/zenith/inmem_smgr.c | 38 +++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c index abc600f0b4a..1d8aa9ac2ee 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/zenith/inmem_smgr.c @@ -177,12 +177,27 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, pg = locate_page(reln, forknum, blocknum); if (pg < 0) { + elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); if (used_pages == MAX_PAGES) elog(ERROR, "Inmem storage overflow"); pg = used_pages; used_pages++; INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); + } else { + elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + forknum, + blocknum, + used_pages); } memcpy(page_body[pg], buffer, BLCKSZ); } @@ -193,18 +208,17 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum) { - int nblocks = 0; - - for (int i = 0; i < used_pages; i++) - { - if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode) - && forknum == page_tag[i].forkNum) - { - if (page_tag[i].blockNum >= nblocks) - nblocks = page_tag[i].blockNum + 1; - } - } - return nblocks; + /* + * It's not clear why a WAL redo function would call 
smgrnblocks(). + * During recovery, at least before reaching consistency, the size of a + * relation could be arbitrarily small, if it was truncated after the + * record being replayed, or arbitrarily large if it was extended + * afterwards. But one place where it's called is in + * XLogReadBufferExtended(): it extends the relation, if it's smaller than + * the requested page. That's a waste of time in the WAL redo + * process. Pretend that all relations are maximally sized to avoid it. + */ + return MaxBlockNumber; } /* From ae5f31497b71875ed6600651ce3764c1dbfe819d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sun, 1 May 2022 17:02:23 +0400 Subject: [PATCH 144/167] Send timeline_start_lsn in Elected and receive it in VoteResponse messages. To support remembering it on safekeeper. Currently compute doesn't know initial LSN on non-first boot (though it could get it from pageserver in theory), so we rely on safekeepers to fetch it back. While changing the protocol, also add node_id to AcceptorProposerGreeting. 
--- src/backend/replication/walproposer.c | 75 ++++++++++++++++++--------- src/include/replication/walproposer.h | 9 +++- 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index d9d44201242..21a538fd603 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -99,6 +99,7 @@ static TermHistory propTermHistory; /* term history of the proposer */ static XLogRecPtr propEpochStartLsn; /* epoch start lsn of the proposer */ static term_t donorEpoch; /* Most advanced acceptor epoch */ static int donor; /* Most advanced acceptor */ +static XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ static int n_votes = 0; static int n_connected = 0; static TimestampTz last_reconnect_attempt; @@ -767,7 +768,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ if (!AsyncFlush(sk)) return; - + /* flush is done, event set and state will be updated later */ StartStreaming(sk); break; @@ -977,7 +978,7 @@ RecvAcceptorGreeting(Safekeeper *sk) } else if (sk->greetResponse.term > propTerm) { - /* Another compute with higher term is running. */ + /* Another compute with higher term is running. 
*/ elog(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", sk->host, sk->port, sk->greetResponse.term, propTerm); @@ -1037,10 +1038,11 @@ RecvVoteResponse(Safekeeper *sk) return; elog(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn)); + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* * In case of acceptor rejecting our vote, bail out, but only @@ -1081,7 +1083,7 @@ RecvVoteResponse(Safekeeper *sk) /* * Called once a majority of acceptors have voted for us and current proposer * has been elected. - * + * * Sends ProposerElected message to all acceptors in SS_IDLE state and starts * replication from walsender. */ @@ -1118,7 +1120,7 @@ HandleElectedProposer(void) SendProposerElected(&safekeeper[i]); } - /* + /* * The proposer has been elected, and there will be no quorum waiting * after this point. There will be no safekeeper with state SS_IDLE * also, because that state is used only for quorum waiting. 
@@ -1173,6 +1175,7 @@ DetermineEpochStartLsn(void) propEpochStartLsn = InvalidXLogRecPtr; donorEpoch = 0; truncateLsn = InvalidXLogRecPtr; + timelineStartLsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { @@ -1187,6 +1190,20 @@ DetermineEpochStartLsn(void) donor = i; } truncateLsn = Max(safekeeper[i].voteResponse.truncateLsn, truncateLsn); + + if (safekeeper[i].voteResponse.timelineStartLsn != InvalidXLogRecPtr) + { + /* timelineStartLsn should be the same everywhere or unknown */ + if (timelineStartLsn != InvalidXLogRecPtr && + timelineStartLsn != safekeeper[i].voteResponse.timelineStartLsn) + { + elog(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(timelineStartLsn), + LSN_FORMAT_ARGS(safekeeper[i].voteResponse.timelineStartLsn)); + } + timelineStartLsn = safekeeper[i].voteResponse.timelineStartLsn; + } } } @@ -1194,12 +1211,16 @@ DetermineEpochStartLsn(void) * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing * was committed yet. To keep the idea of always starting streaming since * record boundary (which simplifies decoding on safekeeper), take start - * position of the slot. + * position of the slot. TODO: take it from .signal file. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + if (timelineStartLsn == InvalidXLogRecPtr) + { + timelineStartLsn = MyReplicationSlot->data.restart_lsn; + } ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); } @@ -1332,7 +1353,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec * safekeeper is synced, being important for sync-safekeepers) * 2) Communicating starting streaming point -- safekeeper must truncate its WAL * beyond it -- and history of term switching. 
- * + * * Sets sk->startStreamingAt. */ static void @@ -1343,7 +1364,7 @@ SendProposerElected(Safekeeper *sk) term_t lastCommonTerm; int i; - /* + /* * Determine start LSN by comparing safekeeper's log term switch history and * proposer's, searching for the divergence point. * @@ -1352,7 +1373,7 @@ SendProposerElected(Safekeeper *sk) * wrote some WAL on single sk and died; we stream since the beginning then. */ th = &sk->voteResponse.termHistory; - /* + /* * If any WAL is present on the sk, it must be authorized by some term. * OTOH, without any WAL there are no term swiches in the log. */ @@ -1382,11 +1403,11 @@ SendProposerElected(Safekeeper *sk) * that all safekeepers reported that they have persisted WAL up * to the truncateLsn before, but now current safekeeper tells * otherwise. - * + * * Also we have a special condition here, which is empty safekeeper * with no history. In combination with a gap, that can happen when * we introduce a new safekeeper to the cluster. This is a rare case, - * which is triggered manually for now, and should be treated with + * which is triggered manually for now, and should be treated with * care. */ @@ -1429,12 +1450,13 @@ SendProposerElected(Safekeeper *sk) msg.term = propTerm; msg.startStreamingAt = sk->startStreamingAt; msg.termHistory = &propTermHistory; + msg.timelineStartLsn = timelineStartLsn; lastCommonTerm = i >= 0 ? 
propTermHistory.entries[i].term : 0; elog(LOG, - "sending elected msg term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s", - msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port); - + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + resetStringInfo(&sk->outbuf); pq_sendint64_le(&sk->outbuf, msg.tag); pq_sendint64_le(&sk->outbuf, msg.term); @@ -1445,6 +1467,7 @@ SendProposerElected(Safekeeper *sk) pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].term); pq_sendint64_le(&sk->outbuf, msg.termHistory->entries[i].lsn); } + pq_sendint64_le(&sk->outbuf, msg.timelineStartLsn); if (!AsyncWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_SEND_ELECTED_FLUSH)) return; @@ -1475,7 +1498,7 @@ WalProposerStartStreaming(XLogRecPtr startpos) static void StartStreaming(Safekeeper *sk) { - /* + /* * This is the only entrypoint to state SS_ACTIVE. It's executed * exactly once for a connection. */ @@ -1546,7 +1569,7 @@ HandleActiveState(Safekeeper *sk, uint32 events) /* * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data * in the buffer. - * + * * LSN comparison checks if we have pending unsent messages. This check isn't * necessary now, because we always send append messages immediately after * arrival. But it's good to have it here in case we change this behavior @@ -1561,9 +1584,9 @@ HandleActiveState(Safekeeper *sk, uint32 events) /* * Send WAL messages starting from sk->streamingAt until the end or non-writable * socket, whichever comes first. Caller should take care of updating event set. 
- * Even if no unsent WAL is available, at least one empty message will be sent + * Even if no unsent WAL is available, at least one empty message will be sent * as a heartbeat, if socket is ready. - * + * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. */ @@ -1579,7 +1602,7 @@ SendAppendRequests(Safekeeper *sk) if (sk->flushWrite) { if (!AsyncFlush(sk)) - /* + /* * AsyncFlush failed, that could happen if the socket is closed or * we have nothing to write and should wait for writeable socket. */ @@ -1631,7 +1654,7 @@ SendAppendRequests(Safekeeper *sk) sk->outbuf.len += req->endLsn - req->beginLsn; writeResult = walprop_async_write(sk->conn, sk->outbuf.data, sk->outbuf.len); - + /* Mark current message as sent, whatever the result is */ sk->streamingAt = endLsn; @@ -1669,7 +1692,7 @@ SendAppendRequests(Safekeeper *sk) * * Can change state if Async* functions encounter errors and reset connection. * Returns false in this case, true otherwise. - * + * * NB: This function can call SendMessageToNode and produce new messages. */ static bool @@ -1988,7 +2011,7 @@ HandleSafekeeperResponse(void) /* * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the + * flushed to all safekeepers. We must always start streaming from the * beginning of the record, which simplifies decoding on the far end. * * Advanced truncateLsn should be not further than nearest commitLsn. @@ -2051,7 +2074,7 @@ HandleSafekeeperResponse(void) } } -/* +/* * Try to read CopyData message from i'th safekeeper, resetting connection on * failure. */ @@ -2082,7 +2105,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) * Read next message with known type into provided struct, by reading a CopyData * block from the safekeeper's postgres connection, returning whether the read * was successful. 
- * + * * If the read needs more polling, we return 'false' and keep the state * unmodified, waiting until it becomes read-ready to try again. If it fully * failed, a warning is emitted and the connection is reset. @@ -2118,6 +2141,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) { AcceptorGreeting *msg = (AcceptorGreeting *) anymsg; msg->term = pq_getmsgint64_le(&s); + msg->nodeId = pq_getmsgint64_le(&s); pq_getmsgend(&s); return true; } @@ -2137,6 +2161,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->termHistory.entries[i].term = pq_getmsgint64_le(&s); msg->termHistory.entries[i].lsn = pq_getmsgint64_le(&s); } + msg->timelineStartLsn = pq_getmsgint64_le(&s); pq_getmsgend(&s); return true; } diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 19361eeaffc..1fcaaa3fc11 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -11,7 +11,7 @@ #include "replication/walreceiver.h" #define SK_MAGIC 0xCafeCeefu -#define SK_PROTOCOL_VERSION 1 +#define SK_PROTOCOL_VERSION 2 #define MAX_SAFEKEEPERS 32 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */ @@ -147,6 +147,9 @@ typedef enum /* Consensus logical timestamp. */ typedef uint64 term_t; +/* neon storage node id */ +typedef uint64 NNodeId; + /* * Proposer <-> Acceptor messaging. 
*/ @@ -177,6 +180,7 @@ typedef struct AcceptorGreeting { AcceptorProposerMessage apm; term_t term; + NNodeId nodeId; } AcceptorGreeting; /* @@ -214,6 +218,7 @@ typedef struct VoteResponse { XLogRecPtr flushLsn; XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */ TermHistory termHistory; + XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */ } VoteResponse; /* @@ -228,6 +233,8 @@ typedef struct ProposerElected XLogRecPtr startStreamingAt; /* history of term switches up to this proposer */ TermHistory *termHistory; + /* timeline globally starts at this LSN */ + XLogRecPtr timelineStartLsn; } ProposerElected; /* From ce3057955ac962662c6fe0d00d793bfccedf7ca8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 2 May 2022 19:32:27 +0400 Subject: [PATCH 145/167] Verify basebackup LSN against consensus LSN in walproposer. If not, such basebackup (clog etc) is inconsistent and must be retaken. Basebackup LSN is taken by exposing xlog.c RedoStartLSN in shmem. ref https://github.com/neondatabase/neon/issues/594 --- src/backend/access/transam/xlog.c | 16 ++++++ src/backend/replication/walproposer.c | 77 ++++++++++++++++++--------- src/backend/storage/ipc/ipci.c | 4 +- src/include/access/xlog.h | 2 + src/include/replication/walproposer.h | 10 ++-- 5 files changed, 76 insertions(+), 33 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f10dcd95bdc..fc61ef9c084 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -750,6 +750,9 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; XLogRecPtr lastWrittenPageLSN; + /* neon: copy of startup's RedoStartLSN for walproposer's use */ + XLogRecPtr RedoStartLSN; + /* * size of a timeline in zenith pageserver. * used to enforce timeline size limit. 
@@ -6896,6 +6899,8 @@ StartupXLOG(void) checkPointLoc = zenithLastRec; RedoStartLSN = ControlFile->checkPointCopy.redo; + /* make basebackup LSN available for walproposer */ + XLogCtl->RedoStartLSN = RedoStartLSN; EndRecPtr = ControlFile->checkPointCopy.redo; memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); @@ -6966,6 +6971,7 @@ StartupXLOG(void) /* Get the last valid checkpoint record. */ checkPointLoc = ControlFile->checkPoint; RedoStartLSN = ControlFile->checkPointCopy.redo; + XLogCtl->RedoStartLSN = RedoStartLSN; record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true); if (record != NULL) { @@ -8830,6 +8836,16 @@ SetLastWrittenPageLSN(XLogRecPtr lsn) SpinLockRelease(&XLogCtl->info_lck); } +/* + * RedoStartLsn is set only once by startup process, locking is not required + * after its exit. + */ +XLogRecPtr +GetRedoStartLsn(void) +{ + return XLogCtl->RedoStartLSN; +} + uint64 GetZenithCurrentClusterSize(void) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 21a538fd603..a1b179d4be3 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -29,8 +29,6 @@ * safekeepers, learn start LSN of future epoch and run basebackup' * won't work. * - * TODO: check that LSN on safekeepers after start is the same as it was - * after `postgres --sync-safekeepers`. 
*------------------------------------------------------------------------- */ #include "postgres.h" @@ -107,6 +105,8 @@ static TimestampTz last_reconnect_attempt; /* Set to true only in standalone run of `postgres --sync-safekeepers` (see comment on top) */ static bool syncSafekeepers; +static WalproposerShmemState *walprop_shared; + /* Prototypes for private functions */ static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId); static void WalProposerStart(void); @@ -1208,20 +1208,16 @@ DetermineEpochStartLsn(void) } /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing - * was committed yet. To keep the idea of always starting streaming since - * record boundary (which simplifies decoding on safekeeper), take start - * position of the slot. TODO: take it from .signal file. + * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing was + * committed yet. Start streaming then from the basebackup LSN. */ if (propEpochStartLsn == InvalidXLogRecPtr && !syncSafekeepers) { - (void) ReplicationSlotAcquire(WAL_PROPOSER_SLOT_NAME, true); - propEpochStartLsn = truncateLsn = MyReplicationSlot->data.restart_lsn; + propEpochStartLsn = truncateLsn = GetRedoStartLsn(); if (timelineStartLsn == InvalidXLogRecPtr) { - timelineStartLsn = MyReplicationSlot->data.restart_lsn; + timelineStartLsn = GetRedoStartLsn(); } - ReplicationSlotRelease(); elog(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn)); } @@ -1256,6 +1252,32 @@ DetermineEpochStartLsn(void) safekeeper[donor].host, safekeeper[donor].port, LSN_FORMAT_ARGS(truncateLsn) ); + + /* + * Ensure the basebackup we are running (at RedoStartLsn) matches LSN since + * which we are going to write according to the consensus. If not, we must + * bail out, as clog and other non rel data is inconsistent. 
+ */ + if (!syncSafekeepers) + { + if (propEpochStartLsn != GetRedoStartLsn()) + { + /* + * However, allow to proceed if previously elected leader was me; plain + * restart of walproposer not intervened by concurrent compute (who could + * generate WAL) is ok. + */ + if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == + walprop_shared->mineLastElectedTerm))) + { + elog(FATAL, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(propEpochStartLsn), + LSN_FORMAT_ARGS(GetRedoStartLsn())); + } + } + walprop_shared->mineLastElectedTerm = propTerm; + } } /* @@ -1880,27 +1902,30 @@ GetAcknowledgedByQuorumWALPosition(void) return responses[n_safekeepers - quorum]; } - -static ZenithFeedbackState *zf_state; - /* * ZenithFeedbackShmemSize --- report amount of shared memory space needed */ Size -ZenithFeedbackShmemSize(void) +WalproposerShmemSize(void) { - return sizeof(ZenithFeedbackState); + return sizeof(WalproposerShmemState); } bool -ZenithFeedbackShmemInit(void) +WalproposerShmemInit(void) { bool found; LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - zf_state = ShmemInitStruct("Zenith Feedback", - sizeof(ZenithFeedbackState), + walprop_shared = ShmemInitStruct("Walproposer shared state", + sizeof(WalproposerShmemState), &found); + + if (!found) + { + memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + } LWLockRelease(AddinShmemInitLock); return found; @@ -1909,20 +1934,20 @@ ZenithFeedbackShmemInit(void) void zenith_feedback_set(ZenithFeedback *zf) { - SpinLockAcquire(&zf_state->mutex); - memcpy(&zf_state->feedback, zf, sizeof(ZenithFeedback)); - SpinLockRelease(&zf_state->mutex); + SpinLockAcquire(&walprop_shared->mutex); + memcpy(&walprop_shared->feedback, zf, sizeof(ZenithFeedback)); + SpinLockRelease(&walprop_shared->mutex); } void zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { - SpinLockAcquire(&zf_state->mutex); - *writeLsn = 
zf_state->feedback.ps_writelsn; - *flushLsn = zf_state->feedback.ps_flushlsn; - *applyLsn = zf_state->feedback.ps_applylsn; - SpinLockRelease(&zf_state->mutex); + SpinLockAcquire(&walprop_shared->mutex); + *writeLsn = walprop_shared->feedback.ps_writelsn; + *flushLsn = walprop_shared->feedback.ps_flushlsn; + *applyLsn = walprop_shared->feedback.ps_applylsn; + SpinLockRelease(&walprop_shared->mutex); } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 5fb07a87eb8..233bd081f82 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -152,7 +152,7 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); - size = add_size(size, ZenithFeedbackShmemSize()); + size = add_size(size, WalproposerShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); @@ -274,7 +274,7 @@ CreateSharedMemoryAndSemaphores(void) SyncScanShmemInit(); AsyncShmemInit(); - ZenithFeedbackShmemInit(); + WalproposerShmemInit(); #ifdef EXEC_BACKEND diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index f35e3686cf8..66fe9dfcd9e 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -354,6 +354,8 @@ extern void RemovePromoteSignalFiles(void); extern void SetLastWrittenPageLSN(XLogRecPtr lsn); extern XLogRecPtr GetLastWrittenPageLSN(void); +extern XLogRecPtr GetRedoStartLsn(void); + extern void SetZenithCurrentClusterSize(uint64 size); extern uint64 GetZenithCurrentClusterSize(void); diff --git a/src/include/replication/walproposer.h b/src/include/replication/walproposer.h index 1fcaaa3fc11..09743380bc7 100644 --- a/src/include/replication/walproposer.h +++ b/src/include/replication/walproposer.h @@ -283,12 +283,12 @@ typedef struct ZenithFeedback } ZenithFeedback; -typedef struct ZenithFeedbackState +typedef struct WalproposerShmemState { slock_t mutex; ZenithFeedback feedback; - -} ZenithFeedbackState; + term_t 
mineLastElectedTerm; +} WalproposerShmemState; /* * Report safekeeper state to proposer @@ -393,8 +393,8 @@ void ParseZenithFeedbackMessage(StringInfo reply_message, void StartReplication(StartReplicationCmd *cmd); void WalProposerSync(int argc, char *argv[]); -Size ZenithFeedbackShmemSize(void); -bool ZenithFeedbackShmemInit(void); +Size WalproposerShmemSize(void); +bool WalproposerShmemInit(void); void zenith_feedback_set(ZenithFeedback *zf); void zenith_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); From d35bd7132ff6ed600577934e5389c7657087fbe1 Mon Sep 17 00:00:00 2001 From: anastasia Date: Fri, 23 Jul 2021 15:56:58 +0300 Subject: [PATCH 146/167] Implement pg_database_size(): - extend zenith pageserver API to handle new request type; - add dbsize_hook to intercept db_dir_size() call. --- contrib/zenith/libpagestore.c | 1 + contrib/zenith/pagestore_client.h | 19 ++++++ contrib/zenith/pagestore_smgr.c | 100 ++++++++++++++++++++++++++++++ src/backend/utils/adt/dbsize.c | 9 +++ src/include/storage/smgr.h | 3 + 5 files changed, 132 insertions(+) diff --git a/contrib/zenith/libpagestore.c b/contrib/zenith/libpagestore.c index d8ec3eba81d..600e3d791c6 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/zenith/libpagestore.c @@ -457,5 +457,6 @@ _PG_init(void) zenith_log(PqPageStoreTrace, "set zenith_smgr hook"); smgr_hook = smgr_zenith; smgr_init_hook = smgr_init_zenith; + dbsize_hook = zenith_dbsize; } } diff --git a/contrib/zenith/pagestore_client.h b/contrib/zenith/pagestore_client.h index a5dcd1efc06..051dc6bc9a1 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/zenith/pagestore_client.h @@ -31,14 +31,18 @@ typedef enum T_ZenithExistsRequest = 0, T_ZenithNblocksRequest, T_ZenithGetPageRequest, + T_ZenithDbSizeRequest, /* pagestore -> pagestore_client */ T_ZenithExistsResponse = 100, T_ZenithNblocksResponse, T_ZenithGetPageResponse, T_ZenithErrorResponse, + T_ZenithDbSizeResponse, } ZenithMessageTag; + + /* base 
struct for c-style inheritance */ typedef struct { @@ -75,6 +79,14 @@ typedef struct ForkNumber forknum; } ZenithNblocksRequest; + +typedef struct +{ + ZenithRequest req; + Oid dbNode; +} ZenithDbSizeRequest; + + typedef struct { ZenithRequest req; @@ -107,6 +119,12 @@ typedef struct char page[FLEXIBLE_ARRAY_MEMBER]; } ZenithGetPageResponse; +typedef struct +{ + ZenithMessageTag tag; + int64 db_size; +} ZenithDbSizeResponse; + typedef struct { ZenithMessageTag tag; @@ -165,6 +183,7 @@ extern void zenith_write(SMgrRelation reln, ForkNumber forknum, extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum); +extern int64 zenith_dbsize(Oid dbNode); extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum); diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/zenith/pagestore_smgr.c index caa77a59091..544250bb55d 100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/zenith/pagestore_smgr.c @@ -143,6 +143,16 @@ zm_pack_request(ZenithRequest *msg) break; } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + pq_sendbyte(&s, msg_req->req.latest); + pq_sendint64(&s, msg_req->req.lsn); + pq_sendint32(&s, msg_req->dbNode); + + break; + } case T_ZenithGetPageRequest: { ZenithGetPageRequest *msg_req = (ZenithGetPageRequest *) msg; @@ -163,6 +173,7 @@ zm_pack_request(ZenithRequest *msg) case T_ZenithNblocksResponse: case T_ZenithGetPageResponse: case T_ZenithErrorResponse: + case T_ZenithDbSizeResponse: default: elog(ERROR, "unexpected zenith message tag 0x%02x", msg->tag); break; @@ -216,6 +227,18 @@ zm_unpack_response(StringInfo s) break; } + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = palloc0(sizeof(ZenithDbSizeResponse)); + + msg_resp->tag = tag; + msg_resp->db_size = 
pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (ZenithResponse *) msg_resp; + break; + } + case T_ZenithErrorResponse: { ZenithErrorResponse *msg_resp; @@ -242,6 +265,7 @@ zm_unpack_response(StringInfo s) case T_ZenithExistsRequest: case T_ZenithNblocksRequest: case T_ZenithGetPageRequest: + case T_ZenithDbSizeRequest: default: elog(ERROR, "unexpected zenith message tag 0x%02x", tag); break; @@ -309,6 +333,18 @@ zm_to_string(ZenithMessage *msg) appendStringInfoChar(&s, '}'); break; } + case T_ZenithDbSizeRequest: + { + ZenithDbSizeRequest *msg_req = (ZenithDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfoChar(&s, '}'); + break; + } + /* pagestore -> pagestore_client */ case T_ZenithExistsResponse: @@ -356,6 +392,18 @@ zm_to_string(ZenithMessage *msg) appendStringInfoChar(&s, '}'); break; } + case T_ZenithDbSizeResponse: + { + ZenithDbSizeResponse *msg_resp = (ZenithDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"ZenithDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size + ); + appendStringInfoChar(&s, '}'); + + break; + } default: appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); @@ -1286,6 +1334,58 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } +/* + * zenith_db_size() -- Get the size of the database in bytes. 
+ */ +int64 +zenith_dbsize(Oid dbNode) +{ + ZenithResponse *resp; + int64 db_size; + XLogRecPtr request_lsn; + bool latest; + + request_lsn = zenith_get_request_lsn(&latest); + { + ZenithDbSizeRequest request = { + .req.tag = T_ZenithDbSizeRequest, + .req.latest = latest, + .req.lsn = request_lsn, + .dbNode = dbNode, + }; + + resp = page_server->request((ZenithRequest *) &request); + } + + switch (resp->tag) + { + case T_ZenithDbSizeResponse: + db_size = ((ZenithDbSizeResponse *) resp)->db_size; + break; + + case T_ZenithErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not read db size of db %u from page server at lsn %X/%08X", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn), + errdetail("page server returned error: %s", + ((ZenithErrorResponse *) resp)->message))); + break; + + default: + elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + } + + elog(SmgrTrace, "zenith_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, + (uint32) (request_lsn >> 32), (uint32) request_lsn, + db_size); + + pfree(resp); + return db_size; +} + /* * zenith_truncate() -- Truncate relation to specified number of blocks. 
*/ diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 33474e01941..9f4edbc60d8 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -77,6 +77,8 @@ db_dir_size(const char *path) return dirsize; } +dbsize_hook_type dbsize_hook = NULL; + /* * calculate size of database in all tablespaces */ @@ -106,6 +108,13 @@ calculate_database_size(Oid dbOid) /* Include pg_default storage */ snprintf(pathname, sizeof(pathname), "base/%u", dbOid); + + if (dbsize_hook) + { + totalsize = (*dbsize_hook)(dbOid); + return totalsize; + } + totalsize = db_dir_size(pathname); /* Scan the non-default tablespaces */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index c08eaed6179..4a0d6b2e09b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -131,6 +131,9 @@ extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; extern void smgr_init_standard(void); extern void smgr_shutdown_standard(void); +// Alternative implementation of calculate_database_size() +typedef const int64 (*dbsize_hook_type) (Oid dbOid); +extern PGDLLIMPORT dbsize_hook_type dbsize_hook; typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode); extern PGDLLIMPORT smgr_hook_type smgr_hook; From 9a9459a7f9cbcaa0e35ff1f2f34c419238fdec7e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 6 May 2022 12:58:40 +0400 Subject: [PATCH 147/167] Shut down instance on basebackup LSN mismatch. To force making basebackup again. 
--- src/backend/replication/walproposer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index a1b179d4be3..5d167ed3f9f 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -1270,7 +1270,7 @@ DetermineEpochStartLsn(void) if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == walprop_shared->mineLastElectedTerm))) { - elog(FATAL, + elog(PANIC, "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", LSN_FORMAT_ARGS(propEpochStartLsn), LSN_FORMAT_ARGS(GetRedoStartLsn())); From d62ec22effeca7b5794ab2c15a3fd9ee5a4a5b99 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 9 May 2022 22:08:10 +0300 Subject: [PATCH 148/167] Use compute-tools from the new neondatabase Docker Hub repo --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 496228cabcd..b9dade0d7a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # # Image with pre-built tools # -FROM zenithdb/compute-tools:latest AS compute-deps +FROM neondatabase/compute-tools:latest AS compute-deps # Only to get ready zenith_ctl binary as deppendency # From 5a595fdcceca7f9a43398a8ed062896642cfa1c7 Mon Sep 17 00:00:00 2001 From: Egor Suvorov Date: Tue, 10 May 2022 18:50:54 +0300 Subject: [PATCH 149/167] zenith_test_utils extension: add neon_xlogflush() This function is to simplify complex WAL generation in https://github.com/neondatabase/neon/pull/1574 `pg_logical_emit_message` is the easiest way to get a big WAL record, but: * If it's transactional, it gets `COMMIT` record right after * If it's not, WAL is not flushed at all. The function helps here, so we don't rely on the background WAL writer. I suspect the plain `xlogflush()` name may collide in the future, hence the prefix. 
--- .../zenith_test_utils/zenith_test_utils--1.0.sql | 5 +++++ contrib/zenith_test_utils/zenithtest.c | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql index d595b043abf..adc821bcc13 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/zenith_test_utils/zenith_test_utils--1.0.sql @@ -22,3 +22,8 @@ CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +RETURNS VOID +AS 'MODULE_PATHNAME', 'neon_xlogflush' +LANGUAGE C PARALLEL UNSAFE; diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/zenith_test_utils/zenithtest.c index c1e2c1c92f4..d3616d633ed 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/zenith_test_utils/zenithtest.c @@ -20,6 +20,7 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/builtins.h" +#include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" #include "zenith/pagestore_client.h" @@ -32,6 +33,7 @@ PG_FUNCTION_INFO_V1(test_consume_xids); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); +PG_FUNCTION_INFO_V1(neon_xlogflush); /* * Linkage to functions in zenith module. @@ -289,3 +291,14 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(raw_page); } } + +/* + * Directly calls XLogFlush(lsn) to flush WAL buffers. 
+ */
+Datum
+neon_xlogflush(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	lsn = PG_GETARG_LSN(0);
+	XLogFlush(lsn);
+	PG_RETURN_VOID();
+}

From 1db115cecb3dbc2a74c5efa964fdf3a8a341c4d2 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Fri, 13 May 2022 18:29:32 +0300
Subject: [PATCH 150/167] Reduce noise in the logs from inmem_write()

I'm seeing a lot of these warnings from B-tree SPLIT records:

WARNING: inmem_write() called for 1663/12990/16397.0 blk 2630: used_pages 0
CONTEXT: WAL redo at 1/235A1B50 for Btree/SPLIT_R: level 0, firstrightoff 368, newitemoff 408, postingoff 0

That seems OK, replaying a split record legitimately accesses many
buffers: the left half, the right half, left sibling, right sibling,
and child. We could bump up 'temp_buffers' (currently 4), but I didn't
do that because it's also good to get some test coverage for the
inmem_smgr.c.
---
 contrib/zenith/inmem_smgr.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/contrib/zenith/inmem_smgr.c b/contrib/zenith/inmem_smgr.c
index 1d8aa9ac2ee..4eff64bf370 100644
--- a/contrib/zenith/inmem_smgr.c
+++ b/contrib/zenith/inmem_smgr.c
@@ -28,7 +28,11 @@
 #include "storage/relfilenode.h"
 #include "storage/smgr.h"
 
-#define MAX_PAGES 32
+/* Size of the in-memory smgr */
+#define MAX_PAGES 64
+
+/* If more than WARN_PAGES are used, print a warning in the log */
+#define WARN_PAGES 32
 
 static BufferTag page_tag[MAX_PAGES];
 static char page_body[MAX_PAGES][BLCKSZ];
@@ -177,7 +181,15 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	pg = locate_page(reln, forknum, blocknum);
 	if (pg < 0)
 	{
-		elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
+		/*
+		 * We assume the buffer cache is large enough to hold all the buffers
+		 * needed for most operations. Overflowing to this "in-mem smgr" in rare
+		 * cases is OK. 
But if we find that we're using more than WARN_PAGES, + * print a warning so that we get alerted and get to investigate why + * we're accessing so many buffers. + */ + elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1, + "inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, @@ -191,7 +203,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, used_pages++; INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum); } else { - elog(WARNING, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", + elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u", reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, From 79af2faf08d9bec1b1664a72936727dcca36d253 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 18 May 2022 12:14:28 +0400 Subject: [PATCH 151/167] Use compute_ctl instead of zenith_ctl (#162) --- Dockerfile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index b9dade0d7a8..7f4710d3a5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # Image with pre-built tools # FROM neondatabase/compute-tools:latest AS compute-deps -# Only to get ready zenith_ctl binary as deppendency +# Only to get ready compute_ctl binary as deppendency # # Image with Postgres build deps @@ -56,11 +56,14 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local # Copy binaries from compute-tools -COPY --from=compute-deps /usr/local/bin/zenith_ctl /usr/local/bin/zenith_ctl +COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl + +# XXX: temporary symlink for compatibility with old control-plane +RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl # Add postgres shared objects to the search path RUN echo '/usr/local/lib' >> 
/etc/ld.so.conf && /sbin/ldconfig USER postgres -ENTRYPOINT ["/usr/local/bin/zenith_ctl"] +ENTRYPOINT ["/usr/local/bin/compute_ctl"] From 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 24 May 2022 19:44:58 +0300 Subject: [PATCH 152/167] Improve error messages on seccomp loading errors. At https://github.com/neondatabase/neon/pull/1783#issuecomment-1136144433, Kirill saw case where the WAL redo process failed to open /dev/null. That's pretty weird, and I have no idea what might be causing it, but with this patch we'll at least get a little more details if it happens again. This will print the OS error (with %m) if it happens, and also distinguishes between the two error cases that previously both emitted the 'failed to open a test file' error. --- src/backend/postmaster/seccomp.c | 39 ++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/backend/postmaster/seccomp.c b/src/backend/postmaster/seccomp.c index 03971a072cf..3ac21b02983 100644 --- a/src/backend/postmaster/seccomp.c +++ b/src/backend/postmaster/seccomp.c @@ -99,9 +99,6 @@ static int do_seccomp_load_rules(PgSeccompRule *rules, int count, uint32 def_act void seccomp_load_rules(PgSeccompRule *rules, int count) { -#define raise_error(str) \ - ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: " str))) - struct sigaction action = { .sa_flags = SA_SIGINFO }; PgSeccompRule rule; long fd; @@ -113,37 +110,51 @@ void seccomp_load_rules(PgSeccompRule *rules, int count) */ action.sa_sigaction = seccomp_test_sighandler; if (sigaction(SIGSYS, &action, NULL) != 0) - raise_error("failed to install a test SIGSYS handler"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install test SIGSYS handler"))); /* * First, check that open of a well-known file works. * XXX: We use raw syscall() to call the very open(). 
*/ fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); - if (fd < 0 || seccomp_test_sighandler_done) - raise_error("failed to open a test file"); - close((int)fd); + if (seccomp_test_sighandler_done) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: signal handler test flag was set unexpectedly"))); + if (fd < 0) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not open /dev/null for seccomp testing: %m"))); + close((int) fd); /* Set a trap on open() to test seccomp bpf */ rule = PG_SCMP(open, SCMP_ACT_TRAP); if (do_seccomp_load_rules(&rule, 1, SCMP_ACT_ALLOW) != 0) - raise_error("failed to load a test filter"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not load test trap"))); /* Finally, check that open() now raises SIGSYS */ - (void)syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) - raise_error("SIGSYS handler doesn't seem to work"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: SIGSYS handler doesn't seem to work"))); /* Now that everything seems to work, install a proper handler */ action.sa_sigaction = seccomp_deny_sighandler; if (sigaction(SIGSYS, &action, NULL) != 0) - raise_error("failed to install a proper SIGSYS handler"); + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not install SIGSYS handler"))); /* If this succeeds, any syscall not in the list will crash the process */ if (do_seccomp_load_rules(rules, count, SCMP_ACT_TRAP) != 0) - raise_error("failed to enter seccomp mode"); - -#undef raise_error + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("seccomp: could not enter seccomp mode"))); } /* From bb306d907efe39dad4aa79c5100de62f2f91d0ca Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 26 May 2022 21:16:21 +0300 Subject: [PATCH 153/167] Rename contrib/zenith to contrib/neon. 
Rename custom GUCs: - zenith.page_server_connstring -> neon.pageserver_connstring - zenith.zenith_tenant -> neon.tenant_id - zenith.zenith_timeline -> neon.timeline_id - zenith.max_cluster_size -> neon.max_cluster_size --- Dockerfile | 2 +- contrib/{zenith => neon}/Makefile | 14 +++++------ contrib/{zenith => neon}/inmem_smgr.c | 3 ++- contrib/{zenith => neon}/libpagestore.c | 18 ++++++------- .../zenith--1.0.sql => neon/neon--1.0.sql} | 2 +- contrib/{zenith/zenith.c => neon/neon.c} | 6 ++--- .../zenith.control => neon/neon.control} | 4 +-- contrib/{zenith => neon}/pagestore_client.h | 2 +- contrib/{zenith => neon}/pagestore_smgr.c | 6 ++--- contrib/{zenith => neon}/relsize_cache.c | 12 ++++----- contrib/neon_test_utils/Makefile | 25 +++++++++++++++++++ .../neon_test_utils--1.0.sql} | 2 +- .../neon_test_utils/neon_test_utils.control | 5 ++++ .../neontest.c} | 10 ++++---- contrib/zenith_test_utils/Makefile | 25 ------------------- .../zenith_test_utils.control | 5 ---- src/backend/replication/walproposer.c | 10 ++++---- src/backend/tcop/zenith_wal_redo.c | 2 -- src/backend/utils/misc/guc.c | 2 +- 19 files changed, 77 insertions(+), 78 deletions(-) rename contrib/{zenith => neon}/Makefile (58%) rename contrib/{zenith => neon}/inmem_smgr.c (99%) rename contrib/{zenith => neon}/libpagestore.c (96%) rename contrib/{zenith/zenith--1.0.sql => neon/neon--1.0.sql} (85%) rename contrib/{zenith/zenith.c => neon/neon.c} (93%) rename contrib/{zenith/zenith.control => neon/neon.control} (54%) rename contrib/{zenith => neon}/pagestore_client.h (99%) rename contrib/{zenith => neon}/pagestore_smgr.c (99%) rename contrib/{zenith => neon}/relsize_cache.c (91%) create mode 100644 contrib/neon_test_utils/Makefile rename contrib/{zenith_test_utils/zenith_test_utils--1.0.sql => neon_test_utils/neon_test_utils--1.0.sql} (92%) create mode 100644 contrib/neon_test_utils/neon_test_utils.control rename contrib/{zenith_test_utils/zenithtest.c => neon_test_utils/neontest.c} (97%) delete mode 
100644 contrib/zenith_test_utils/Makefile delete mode 100644 contrib/zenith_test_utils/zenith_test_utils.control diff --git a/Dockerfile b/Dockerfile index 7f4710d3a5b..db472efd5e9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,7 @@ RUN mkdir /pg/compute_build && cd /pg/compute_build && \ # Install main binaries and contribs make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/zenith install && \ + make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/neon install && \ # Install headers make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install diff --git a/contrib/zenith/Makefile b/contrib/neon/Makefile similarity index 58% rename from contrib/zenith/Makefile rename to contrib/neon/Makefile index a4a60d7b88c..b6f3cf400ff 100644 --- a/contrib/zenith/Makefile +++ b/contrib/neon/Makefile @@ -1,17 +1,17 @@ -# contrib/zenith/Makefile +# contrib/neon/Makefile -MODULE_big = zenith +MODULE_big = neon OBJS = \ $(WIN32RES) \ - inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o zenith.o + inmem_smgr.o libpagestore.o pagestore_smgr.o relsize_cache.o neon.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) -EXTENSION = zenith -DATA = zenith--1.0.sql -PGFILEDESC = "zenith - cloud storage for PostgreSQL" +EXTENSION = neon +DATA = neon--1.0.sql +PGFILEDESC = "neon - cloud storage for PostgreSQL" ifdef USE_PGXS PG_CONFIG = pg_config @@ -19,7 +19,7 @@ PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) else SHLIB_PREREQS = submake-libpq -subdir = contrib/zenith +subdir = contrib/neon top_builddir = ../.. 
include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk diff --git a/contrib/zenith/inmem_smgr.c b/contrib/neon/inmem_smgr.c similarity index 99% rename from contrib/zenith/inmem_smgr.c rename to contrib/neon/inmem_smgr.c index 4eff64bf370..7840292b08c 100644 --- a/contrib/zenith/inmem_smgr.c +++ b/contrib/neon/inmem_smgr.c @@ -16,7 +16,8 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * contrib/zenith/inmem_smgr.c + * contrib/neon/inmem_smgr.c + * *------------------------------------------------------------------------- */ #include "postgres.h" diff --git a/contrib/zenith/libpagestore.c b/contrib/neon/libpagestore.c similarity index 96% rename from contrib/zenith/libpagestore.c rename to contrib/neon/libpagestore.c index 600e3d791c6..9d632527f44 100644 --- a/contrib/zenith/libpagestore.c +++ b/contrib/neon/libpagestore.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * contrib/zenith/libpqpagestore.c + * contrib/neon/libpqpagestore.c * *------------------------------------------------------------------------- */ @@ -374,7 +374,7 @@ substitute_pageserver_password(const char *page_server_connstring_raw) void _PG_init(void) { - DefineCustomStringVariable("zenith.page_server_connstring", + DefineCustomStringVariable("neon.pageserver_connstring", "connection string to the page server", NULL, &page_server_connstring_raw, @@ -383,7 +383,7 @@ _PG_init(void) 0, /* no flags required */ NULL, NULL, NULL); - DefineCustomStringVariable("zenith.callmemaybe_connstring", + DefineCustomStringVariable("neon.callmemaybe_connstring", "Connection string that Page Server or WAL safekeeper should use to connect to us", NULL, &callmemaybe_connstring, @@ -392,7 +392,7 @@ _PG_init(void) 0, /* no flags required */ NULL, NULL, NULL); - DefineCustomStringVariable("zenith.zenith_timeline", + DefineCustomStringVariable("neon.timeline_id", "Zenith timelineid the server is running on", NULL, &zenith_timeline, @@ -401,8 
+401,8 @@ _PG_init(void) 0, /* no flags required */ check_zenith_id, NULL, NULL); - DefineCustomStringVariable("zenith.zenith_tenant", - "Zenith tenantid the server is running on", + DefineCustomStringVariable("neon.tenant_id", + "Neon tenantid the server is running on", NULL, &zenith_tenant, "", @@ -410,7 +410,7 @@ _PG_init(void) 0, /* no flags required */ check_zenith_id, NULL, NULL); - DefineCustomBoolVariable("zenith.wal_redo", + DefineCustomBoolVariable("neon.wal_redo", "start in wal-redo mode", NULL, &wal_redo, @@ -419,7 +419,7 @@ _PG_init(void) 0, NULL, NULL, NULL); - DefineCustomIntVariable("zenith.max_cluster_size", + DefineCustomIntVariable("neon.max_cluster_size", "cluster size limit", NULL, &max_cluster_size, @@ -429,7 +429,7 @@ _PG_init(void) NULL, NULL, NULL); relsize_hash_init(); - EmitWarningsOnPlaceholders("zenith"); + EmitWarningsOnPlaceholders("neon"); if (page_server != NULL) zenith_log(ERROR, "libpqpagestore already loaded"); diff --git a/contrib/zenith/zenith--1.0.sql b/contrib/neon/neon--1.0.sql similarity index 85% rename from contrib/zenith/zenith--1.0.sql rename to contrib/neon/neon--1.0.sql index e414be8ceea..34f1ba78d4f 100644 --- a/contrib/zenith/zenith--1.0.sql +++ b/contrib/neon/neon--1.0.sql @@ -1,4 +1,4 @@ -\echo Use "CREATE EXTENSION zenith" to load this file. \quit +\echo Use "CREATE EXTENSION neon" to load this file. 
\quit CREATE FUNCTION pg_cluster_size() RETURNS bigint diff --git a/contrib/zenith/zenith.c b/contrib/neon/neon.c similarity index 93% rename from contrib/zenith/zenith.c rename to contrib/neon/neon.c index e88984d918c..f6bf6f40d26 100644 --- a/contrib/zenith/zenith.c +++ b/contrib/neon/neon.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------- * - * zenith.c - * Utility functions to expose zenith specific information to user + * neon.c + * Utility functions to expose neon specific information to user * * IDENTIFICATION - * contrib/zenith/zenith.c + * contrib/neon/neon.c * *------------------------------------------------------------------------- */ diff --git a/contrib/zenith/zenith.control b/contrib/neon/neon.control similarity index 54% rename from contrib/zenith/zenith.control rename to contrib/neon/neon.control index 9aa5e2f067a..84f79881c1e 100644 --- a/contrib/zenith/zenith.control +++ b/contrib/neon/neon.control @@ -1,4 +1,4 @@ -# zenith extension +# neon extension comment = 'cloud storage for PostgreSQL' default_version = '1.0' -module_pathname = '$libdir/zenith' +module_pathname = '$libdir/neon' diff --git a/contrib/zenith/pagestore_client.h b/contrib/neon/pagestore_client.h similarity index 99% rename from contrib/zenith/pagestore_client.h rename to contrib/neon/pagestore_client.h index 051dc6bc9a1..eedc0864e90 100644 --- a/contrib/zenith/pagestore_client.h +++ b/contrib/neon/pagestore_client.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * contrib/zenith/pagestore_client.h + * contrib/neon/pagestore_client.h * *------------------------------------------------------------------------- */ diff --git a/contrib/zenith/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c similarity index 99% rename from contrib/zenith/pagestore_smgr.c rename to contrib/neon/pagestore_smgr.c index 544250bb55d..8086143ed46 
100644 --- a/contrib/zenith/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -39,7 +39,7 @@ * * * IDENTIFICATION - * contrib/zenith/pagestore_smgr.c + * contrib/neon/pagestore_smgr.c * *------------------------------------------------------------------------- */ @@ -887,7 +887,7 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, (errcode(ERRCODE_DISK_FULL), errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", max_cluster_size), - errhint("This limit is defined by zenith.max_cluster_size GUC"))); + errhint("This limit is defined by neon.max_cluster_size GUC"))); } zenith_wallog_page(reln, forkNum, blkno, buffer); @@ -1005,7 +1005,7 @@ zenith_writeback(SMgrRelation reln, ForkNumber forknum, } /* - * While function is defined in the zenith extension it's used within zenith_test_utils directly. + * While function is defined in the zenith extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
*/ void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, diff --git a/contrib/zenith/relsize_cache.c b/contrib/neon/relsize_cache.c similarity index 91% rename from contrib/zenith/relsize_cache.c rename to contrib/neon/relsize_cache.c index 993903b1b18..8dfcffe1d16 100644 --- a/contrib/zenith/relsize_cache.c +++ b/contrib/neon/relsize_cache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * contrib/zenith/relsize_cache.c + * contrib/neon/relsize_cache.c * *------------------------------------------------------------------------- */ @@ -57,10 +57,10 @@ zenith_smgr_shmem_startup(void) prev_shmem_startup_hook(); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - relsize_lock = (LWLockId) GetNamedLWLockTranche("zenith_relsize"); + relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); info.keysize = sizeof(RelTag); info.entrysize = sizeof(RelSizeEntry); - relsize_hash = ShmemInitHash("zenith_relsize", + relsize_hash = ShmemInitHash("neon_relsize", relsize_hash_size, relsize_hash_size, &info, HASH_ELEM | HASH_BLOBS); @@ -145,8 +145,8 @@ forget_cached_relsize(RelFileNode rnode, ForkNumber forknum) void relsize_hash_init(void) { - DefineCustomIntVariable("zenith.relsize_hash_size", - "Sets the maximum number of cached relation sizes for zenith", + DefineCustomIntVariable("neon.relsize_hash_size", + "Sets the maximum number of cached relation sizes for neon", NULL, &relsize_hash_size, DEFAULT_RELSIZE_HASH_SIZE, @@ -159,7 +159,7 @@ relsize_hash_init(void) if (relsize_hash_size > 0) { RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); - RequestNamedLWLockTranche("zenith_relsize", 1); + RequestNamedLWLockTranche("neon_relsize", 1); prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = zenith_smgr_shmem_startup; diff --git a/contrib/neon_test_utils/Makefile b/contrib/neon_test_utils/Makefile new file mode 100644 index 00000000000..bd618e6d96e --- /dev/null +++ b/contrib/neon_test_utils/Makefile @@ 
-0,0 +1,25 @@ +# contrib/neon_test_utils/Makefile + + +MODULE_big = neon_test_utils +OBJS = \ + $(WIN32RES) \ + neontest.o + +EXTENSION = neon_test_utils +DATA = neon_test_utils--1.0.sql +PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" + +EXTRA_INSTALL=contrib/neon + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +PG_CPPFLAGS = -I$(top_srcdir)/contrib +subdir = contrib/neon_test_utils +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql b/contrib/neon_test_utils/neon_test_utils--1.0.sql similarity index 92% rename from contrib/zenith_test_utils/zenith_test_utils--1.0.sql rename to contrib/neon_test_utils/neon_test_utils--1.0.sql index adc821bcc13..402981a9a66 100644 --- a/contrib/zenith_test_utils/zenith_test_utils--1.0.sql +++ b/contrib/neon_test_utils/neon_test_utils--1.0.sql @@ -1,5 +1,5 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION -\echo Use "CREATE EXTENSION zenith_test_utils" to load this file. \quit +\echo Use "CREATE EXTENSION neon_test_utils" to load this file. 
\quit CREATE FUNCTION test_consume_xids(nxids int) RETURNS VOID diff --git a/contrib/neon_test_utils/neon_test_utils.control b/contrib/neon_test_utils/neon_test_utils.control new file mode 100644 index 00000000000..94e67205039 --- /dev/null +++ b/contrib/neon_test_utils/neon_test_utils.control @@ -0,0 +1,5 @@ +# neon_test_utils extension +comment = 'helpers for neon testing and debugging' +default_version = '1.0' +module_pathname = '$libdir/neon_test_utils' +relocatable = true diff --git a/contrib/zenith_test_utils/zenithtest.c b/contrib/neon_test_utils/neontest.c similarity index 97% rename from contrib/zenith_test_utils/zenithtest.c rename to contrib/neon_test_utils/neontest.c index d3616d633ed..a3e730efe27 100644 --- a/contrib/zenith_test_utils/zenithtest.c +++ b/contrib/neon_test_utils/neontest.c @@ -1,10 +1,10 @@ /*------------------------------------------------------------------------- * - * zenithtest.c - * Helpers for zenith testing and debugging + * neontest.c + * Helpers for neon testing and debugging * * IDENTIFICATION - * contrib/zenith_test_utils/zenithtest.c + * contrib/neon_test_utils/neontest.c * *------------------------------------------------------------------------- */ @@ -23,7 +23,7 @@ #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" -#include "zenith/pagestore_client.h" +#include "neon/pagestore_client.h" PG_MODULE_MAGIC; @@ -53,7 +53,7 @@ _PG_init(void) /* Asserts verify that typedefs above match original declarations */ AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type); zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type) - load_external_function("$libdir/zenith", "zenith_read_at_lsn", + load_external_function("$libdir/neon", "zenith_read_at_lsn", true, NULL); } diff --git a/contrib/zenith_test_utils/Makefile b/contrib/zenith_test_utils/Makefile deleted file mode 100644 index 5b2fcdc18fe..00000000000 --- a/contrib/zenith_test_utils/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# 
contrib/zenith_test_utils/Makefile - - -MODULE_big = zenith_test_utils -OBJS = \ - $(WIN32RES) \ - zenithtest.o - -EXTENSION = zenith_test_utils -DATA = zenith_test_utils--1.0.sql -PGFILEDESC = "zenith_test_utils - helpers for zenith testing and debugging" - -EXTRA_INSTALL=contrib/zenith - -ifdef USE_PGXS -PG_CONFIG = pg_config -PGXS := $(shell $(PG_CONFIG) --pgxs) -include $(PGXS) -else -PG_CPPFLAGS = -I$(top_srcdir)/contrib -subdir = contrib/zenith_test_utils -top_builddir = ../.. -include $(top_builddir)/src/Makefile.global -include $(top_srcdir)/contrib/contrib-global.mk -endif diff --git a/contrib/zenith_test_utils/zenith_test_utils.control b/contrib/zenith_test_utils/zenith_test_utils.control deleted file mode 100644 index 9b947b63966..00000000000 --- a/contrib/zenith_test_utils/zenith_test_utils.control +++ /dev/null @@ -1,5 +0,0 @@ -# zenith_test_utils extension -comment = 'helpers for zenith testing and debugging' -default_version = '1.0' -module_pathname = '$libdir/zenith_test_utils' -relocatable = true diff --git a/src/backend/replication/walproposer.c b/src/backend/replication/walproposer.c index 5d167ed3f9f..917ab5d294b 100644 --- a/src/backend/replication/walproposer.c +++ b/src/backend/replication/walproposer.c @@ -388,7 +388,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); - load_file("zenith", false); + load_file("neon", false); for (host = wal_acceptors_list; host != NULL && *host != '\0'; host = sep) { @@ -437,15 +437,15 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId) pg_strong_random(&greetRequest.proposerId, sizeof(greetRequest.proposerId)); greetRequest.systemId = systemId; if (!zenith_timeline_walproposer) - elog(FATAL, "zenith.zenith_timeline is not provided"); + elog(FATAL, "neon.timeline_id is not provided"); if (*zenith_timeline_walproposer != '\0' && 
!HexDecodeString(greetRequest.ztimelineid, zenith_timeline_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_timeline, %s", zenith_timeline_walproposer); + elog(FATAL, "Could not parse neon.timeline_id, %s", zenith_timeline_walproposer); if (!zenith_tenant_walproposer) - elog(FATAL, "zenith.zenith_tenant is not provided"); + elog(FATAL, "neon.tenant_id is not provided"); if (*zenith_tenant_walproposer != '\0' && !HexDecodeString(greetRequest.ztenantid, zenith_tenant_walproposer, 16)) - elog(FATAL, "Could not parse zenith.zenith_tenant, %s", zenith_tenant_walproposer); + elog(FATAL, "Could not parse neon.tenant_id, %s", zenith_tenant_walproposer); greetRequest.timeline = ThisTimeLineID; greetRequest.walSegSize = wal_segment_size; diff --git a/src/backend/tcop/zenith_wal_redo.c b/src/backend/tcop/zenith_wal_redo.c index 68f29564328..96e1058c406 100644 --- a/src/backend/tcop/zenith_wal_redo.c +++ b/src/backend/tcop/zenith_wal_redo.c @@ -10,8 +10,6 @@ * processes. Instead, we wait for command from 'stdin', and respond to * 'stdout'. * - * There's a TAP test for this in contrib/zenith_store/t/002_wal_redo_helper.pl - * * The protocol through stdin/stdout is loosely based on the libpq protocol. 
* The process accepts messages through stdin, and each message has the format: * diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index bbc1ec8e181..9fbc5b0a19a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2114,7 +2114,7 @@ static struct config_bool ConfigureNamesBool[] = }, { - {"zenith_test_evict", PGC_POSTMASTER, UNGROUPED, + {"neon_test_evict", PGC_POSTMASTER, UNGROUPED, gettext_noop("Evict unpinned pages (for better test coverage)"), }, &zenith_test_evict, From a424e3ccff7d6af97d9ee5d4b727fb8324c78e11 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 19 Apr 2022 15:36:25 +0300 Subject: [PATCH 154/167] Rename 'wal_acceptors' GUC to 'safekeepers' --- src/backend/utils/misc/guc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9fbc5b0a19a..ed369c0be75 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4660,8 +4660,8 @@ static struct config_string ConfigureNamesString[] = }, { - {"wal_acceptors", PGC_POSTMASTER, UNGROUPED, - gettext_noop("List of Zenith WAL acceptors (host:port)"), + {"safekeepers", PGC_POSTMASTER, UNGROUPED, + gettext_noop("List of Neon WAL acceptors (host:port)"), NULL, GUC_LIST_INPUT | GUC_LIST_QUOTE }, From 64bed0420ca3f54e77cf2a87b6f750400244a2d8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 27 May 2022 16:04:10 +0300 Subject: [PATCH 155/167] Cache last written LSN for last updated relations to reduce wait LSN time for queries to other relations --- contrib/neon/pagestore_smgr.c | 16 +++--- src/backend/access/gin/gininsert.c | 2 +- src/backend/access/gist/gistbuild.c | 4 +- src/backend/access/spgist/spginsert.c | 2 +- src/backend/access/transam/xlog.c | 72 +++++++++++++++++++++++++-- src/backend/commands/dbcommands.c | 4 +- src/include/access/xlog.h | 4 +- 7 files changed, 83 insertions(+), 21 deletions(-) diff --git 
a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 8086143ed46..a2a90a9b2ca 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -559,7 +559,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenPageLSN(lsn); + SetLastWrittenPageLSN(lsn, &reln->smgr_rnode.node); } @@ -604,7 +604,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest) +zenith_get_request_lsn(bool *latest, RelFileNode *rnode) { XLogRecPtr lsn; @@ -631,7 +631,7 @@ zenith_get_request_lsn(bool *latest) * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenPageLSN(); + lsn = GetLastWrittenPageLSN(rnode); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); @@ -717,7 +717,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -1080,7 +1080,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1285,7 +1285,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node); { ZenithNblocksRequest request = { 
.req.tag = T_ZenithNblocksRequest, @@ -1345,7 +1345,7 @@ zenith_dbsize(Oid dbNode) XLogRecPtr request_lsn; bool latest; - request_lsn = zenith_get_request_lsn(&latest); + request_lsn = zenith_get_request_lsn(&latest, NULL); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, @@ -1432,7 +1432,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn); + SetLastWrittenPageLSN(lsn, &reln->smgr_rnode.node); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index dfad28d1f61..f5de1d5ab9b 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -422,7 +422,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd); + SetLastWrittenPageLSN(XactLastRecEnd, &index->rd_smgr->smgr_rnode.node); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index aef96c91da0..1c363a73b93 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -336,7 +336,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd); + SetLastWrittenPageLSN(XactLastRecEnd, &index->rd_smgr->smgr_rnode.node); smgr_end_unlogged_build(index->rd_smgr); } @@ -469,7 +469,7 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenPageLSN(lsn); + SetLastWrittenPageLSN(lsn, &state->indexrel->rd_smgr->smgr_rnode.node); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index d85dd54e4df..cf6fe036283 100644 --- a/src/backend/access/spgist/spginsert.c 
+++ b/src/backend/access/spgist/spginsert.c @@ -144,7 +144,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd); + SetLastWrittenPageLSN(XactLastRecEnd, &index->rd_smgr->smgr_rnode.node); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index fc61ef9c084..b378d3d1e62 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -87,6 +87,9 @@ extern uint32 bootstrap_data_checksum_version; #define RECOVERY_COMMAND_FILE "recovery.conf" #define RECOVERY_COMMAND_DONE "recovery.done" +/* Size of last written page LSN cache. Should not be large because sequential search is used. */ +#define LAST_WRITTEN_CACHE_SIZE 4 + /* User-settable parameters */ int max_wal_size_mb = 1024; /* 1 GB */ int min_wal_size_mb = 80; /* 80 MB */ @@ -748,7 +751,17 @@ typedef struct XLogCtlData * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. */ XLogRecPtr lastFpwDisableRecPtr; + + /* + * Cache of last written page LSN. + * We store this value for up to LAST_WRITTEN_CACHE_SIZE relations + maximum for all other relations. 
+ */ XLogRecPtr lastWrittenPageLSN; + struct { + RelFileNode rnode; + XLogRecPtr lsn; + } lastWrittenPageCache[LAST_WRITTEN_CACHE_SIZE]; + size_t lastWrittenPageCacheClock; /* Pointer of the victim element for clock replacement algorithm */ /* neon: copy of startup's RedoStartLSN for walproposer's use */ XLogRecPtr RedoStartLSN; @@ -8089,7 +8102,8 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; XLogCtl->lastWrittenPageLSN = EndOfLog; - + memset(XLogCtl->lastWrittenPageCache, 0, sizeof XLogCtl->lastWrittenPageCache); + XLogCtl->lastWrittenPageCacheClock = 0; LocalSetXLogInsertAllowed(); /* If necessary, write overwrite-contrecord before doing anything else */ @@ -8814,11 +8828,31 @@ GetInsertRecPtr(void) * GetLastWrittenPageLSN -- Returns maximal LSN of written page */ XLogRecPtr -GetLastWrittenPageLSN(void) +GetLastWrittenPageLSN(RelFileNode *rnode) { XLogRecPtr lsn; SpinLockAcquire(&XLogCtl->info_lck); lsn = XLogCtl->lastWrittenPageLSN; + if (rnode != NULL) + { + for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) + { + if (RelFileNodeEquals(*rnode, XLogCtl->lastWrittenPageCache[i].rnode)) + { + lsn = XLogCtl->lastWrittenPageCache[i].lsn; + break; + } + } + } + else + { + /* Find maximum of all cached LSNs */ + for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) + { + if (XLogCtl->lastWrittenPageCache[i].lsn > lsn) + lsn = XLogCtl->lastWrittenPageCache[i].lsn; + } + } SpinLockRelease(&XLogCtl->info_lck); return lsn; @@ -8828,11 +8862,39 @@ GetLastWrittenPageLSN(void) * SetLastWrittenPageLSN -- Set maximal LSN of written page */ void -SetLastWrittenPageLSN(XLogRecPtr lsn) +SetLastWrittenPageLSN(XLogRecPtr lsn, RelFileNode *rnode) { SpinLockAcquire(&XLogCtl->info_lck); - if (lsn > XLogCtl->lastWrittenPageLSN) - XLogCtl->lastWrittenPageLSN = lsn; + if (rnode == NULL) + { + if (lsn > XLogCtl->lastWrittenPageLSN) + XLogCtl->lastWrittenPageLSN = lsn; + } + else + { + int i = LAST_WRITTEN_CACHE_SIZE; + while (--i >= 0) + { + 
if (RelFileNodeEquals(*rnode, XLogCtl->lastWrittenPageCache[i].rnode)) + { + if (lsn > XLogCtl->lastWrittenPageCache[i].lsn) + { + XLogCtl->lastWrittenPageCache[i].lsn = lsn; + } + break; + } + } + if (i < 0) + { + int victim = ++XLogCtl->lastWrittenPageCacheClock % LAST_WRITTEN_CACHE_SIZE; + if (XLogCtl->lastWrittenPageCache[victim].lsn > XLogCtl->lastWrittenPageLSN) + { + XLogCtl->lastWrittenPageLSN = XLogCtl->lastWrittenPageCache[victim].lsn; + } + XLogCtl->lastWrittenPageCache[victim].rnode = *rnode; + XLogCtl->lastWrittenPageCache[victim].lsn = lsn; + } + } SpinLockRelease(&XLogCtl->info_lck); } diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 509e482c355..987175c21b7 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -674,7 +674,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenPageLSN(lsn); + SetLastWrittenPageLSN(lsn, NULL); } } table_endscan(scan); @@ -2225,7 +2225,7 @@ dbase_redo(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; - SetLastWrittenPageLSN(lsn); + SetLastWrittenPageLSN(lsn, NULL); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 66fe9dfcd9e..48f28edfeda 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -351,8 +351,8 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenPageLSN(XLogRecPtr lsn); -extern XLogRecPtr GetLastWrittenPageLSN(void); +extern void SetLastWrittenPageLSN(XLogRecPtr lsn, RelFileNode *rnode); +extern XLogRecPtr GetLastWrittenPageLSN(RelFileNode *rnode); extern XLogRecPtr GetRedoStartLsn(void); From f96d40b4005845b0d01d86135d2b797ed6a1c85b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 28 May 2022 17:20:31 +0300 Subject: [PATCH 
156/167] Store only relfilenode in last written LSN cache --- contrib/neon/pagestore_smgr.c | 6 +-- src/backend/access/gin/gininsert.c | 2 +- src/backend/access/gist/gistbuild.c | 4 +- src/backend/access/spgist/spginsert.c | 2 +- src/backend/access/transam/xlog.c | 54 ++++++++++++++------------- src/backend/commands/dbcommands.c | 4 +- src/include/access/xlog.h | 4 +- 7 files changed, 39 insertions(+), 37 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index a2a90a9b2ca..466113c8524 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -559,7 +559,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenPageLSN(lsn, &reln->smgr_rnode.node); + SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); } @@ -631,7 +631,7 @@ zenith_get_request_lsn(bool *latest, RelFileNode *rnode) * so our request cannot concern those. 
*/ *latest = true; - lsn = GetLastWrittenPageLSN(rnode); + lsn = GetLastWrittenPageLSN(rnode->relNode); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); @@ -1432,7 +1432,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn, &reln->smgr_rnode.node); + SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index f5de1d5ab9b..b603ab1a567 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -422,7 +422,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, &index->rd_smgr->smgr_rnode.node); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 1c363a73b93..dc401767cb7 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -336,7 +336,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, &index->rd_smgr->smgr_rnode.node); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); smgr_end_unlogged_build(index->rd_smgr); } @@ -469,7 +469,7 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenPageLSN(lsn, &state->indexrel->rd_smgr->smgr_rnode.node); + SetLastWrittenPageLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c 
b/src/backend/access/spgist/spginsert.c index cf6fe036283..502fd11ad0a 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -144,7 +144,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, &index->rd_smgr->smgr_rnode.node); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b378d3d1e62..c9476a5d4dc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -756,12 +756,10 @@ typedef struct XLogCtlData * Cache of last written page LSN. * We store this value for up to LAST_WRITTEN_CACHE_SIZE relations + maximum for all other relations. */ - XLogRecPtr lastWrittenPageLSN; - struct { - RelFileNode rnode; - XLogRecPtr lsn; - } lastWrittenPageCache[LAST_WRITTEN_CACHE_SIZE]; - size_t lastWrittenPageCacheClock; /* Pointer of the victim element for clock replacement algorithm */ + XLogRecPtr lastWrittenPageLsn; + XLogRecPtr lastWrittenPageCacheLsn[LAST_WRITTEN_CACHE_SIZE]; + Oid lastWrittenPageCacheOid[LAST_WRITTEN_CACHE_SIZE]; + size_t lastWrittenPageCacheClock; /* Pointer of the victim element for clock replacement algorithm */ /* neon: copy of startup's RedoStartLSN for walproposer's use */ XLogRecPtr RedoStartLSN; @@ -8101,8 +8099,12 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - XLogCtl->lastWrittenPageLSN = EndOfLog; - memset(XLogCtl->lastWrittenPageCache, 0, sizeof XLogCtl->lastWrittenPageCache); + XLogCtl->lastWrittenPageLsn = EndOfLog; + for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) + { + XLogCtl->lastWrittenPageCacheLsn[i] = InvalidXLogRecPtr; + XLogCtl->lastWrittenPageCacheOid[i] = InvalidOid; + } XLogCtl->lastWrittenPageCacheClock = 0; LocalSetXLogInsertAllowed(); @@ -8828,18 
+8830,18 @@ GetInsertRecPtr(void) * GetLastWrittenPageLSN -- Returns maximal LSN of written page */ XLogRecPtr -GetLastWrittenPageLSN(RelFileNode *rnode) +GetLastWrittenPageLSN(Oid rnode) { XLogRecPtr lsn; SpinLockAcquire(&XLogCtl->info_lck); - lsn = XLogCtl->lastWrittenPageLSN; - if (rnode != NULL) + lsn = XLogCtl->lastWrittenPageLsn; + if (rnode != InvalidOid) { for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) { - if (RelFileNodeEquals(*rnode, XLogCtl->lastWrittenPageCache[i].rnode)) + if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) { - lsn = XLogCtl->lastWrittenPageCache[i].lsn; + lsn = XLogCtl->lastWrittenPageCacheLsn[i]; break; } } @@ -8849,8 +8851,8 @@ GetLastWrittenPageLSN(RelFileNode *rnode) /* Find maximum of all cached LSNs */ for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) { - if (XLogCtl->lastWrittenPageCache[i].lsn > lsn) - lsn = XLogCtl->lastWrittenPageCache[i].lsn; + if (XLogCtl->lastWrittenPageCacheLsn[i] > lsn) + lsn = XLogCtl->lastWrittenPageCacheLsn[i]; } } SpinLockRelease(&XLogCtl->info_lck); @@ -8862,24 +8864,24 @@ GetLastWrittenPageLSN(RelFileNode *rnode) * SetLastWrittenPageLSN -- Set maximal LSN of written page */ void -SetLastWrittenPageLSN(XLogRecPtr lsn, RelFileNode *rnode) +SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) { SpinLockAcquire(&XLogCtl->info_lck); - if (rnode == NULL) + if (rnode == InvalidOid) { - if (lsn > XLogCtl->lastWrittenPageLSN) - XLogCtl->lastWrittenPageLSN = lsn; + if (lsn > XLogCtl->lastWrittenPageLsn) + XLogCtl->lastWrittenPageLsn = lsn; } else { int i = LAST_WRITTEN_CACHE_SIZE; while (--i >= 0) { - if (RelFileNodeEquals(*rnode, XLogCtl->lastWrittenPageCache[i].rnode)) + if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) { - if (lsn > XLogCtl->lastWrittenPageCache[i].lsn) + if (lsn > XLogCtl->lastWrittenPageCacheLsn[i]) { - XLogCtl->lastWrittenPageCache[i].lsn = lsn; + XLogCtl->lastWrittenPageCacheLsn[i] = lsn; } break; } @@ -8887,12 +8889,12 @@ SetLastWrittenPageLSN(XLogRecPtr lsn, RelFileNode *rnode) if (i 
< 0) { int victim = ++XLogCtl->lastWrittenPageCacheClock % LAST_WRITTEN_CACHE_SIZE; - if (XLogCtl->lastWrittenPageCache[victim].lsn > XLogCtl->lastWrittenPageLSN) + if (XLogCtl->lastWrittenPageCacheLsn[victim] > XLogCtl->lastWrittenPageLsn) { - XLogCtl->lastWrittenPageLSN = XLogCtl->lastWrittenPageCache[victim].lsn; + XLogCtl->lastWrittenPageLsn = XLogCtl->lastWrittenPageCacheLsn[victim]; } - XLogCtl->lastWrittenPageCache[victim].rnode = *rnode; - XLogCtl->lastWrittenPageCache[victim].lsn = lsn; + XLogCtl->lastWrittenPageCacheOid[victim] = rnode; + XLogCtl->lastWrittenPageCacheLsn[victim] = lsn; } } SpinLockRelease(&XLogCtl->info_lck); diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 987175c21b7..f562e433747 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -674,7 +674,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenPageLSN(lsn, NULL); + SetLastWrittenPageLSN(lsn, InvalidOid); } } table_endscan(scan); @@ -2225,7 +2225,7 @@ dbase_redo(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; - SetLastWrittenPageLSN(lsn, NULL); + SetLastWrittenPageLSN(lsn, InvalidOid); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 48f28edfeda..ae0fd33a834 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -351,8 +351,8 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenPageLSN(XLogRecPtr lsn, RelFileNode *rnode); -extern XLogRecPtr GetLastWrittenPageLSN(RelFileNode *rnode); +extern void SetLastWrittenPageLSN(XLogRecPtr lsn, Oid relfilenode); +extern XLogRecPtr GetLastWrittenPageLSN(Oid relfilenode); extern XLogRecPtr GetRedoStartLsn(void); From c9d8ec7b99b8baa46ca895303ba01e5430b8463b Mon Sep 17 00:00:00 
2001 From: Anastasia Lubennikova Date: Tue, 31 May 2022 14:34:40 +0300 Subject: [PATCH 157/167] Add forknum as a part of the lastWrittenPageCacheOid[] key --- contrib/neon/pagestore_smgr.c | 16 ++++++++-------- src/backend/access/gin/gininsert.c | 2 +- src/backend/access/gist/gistbuild.c | 4 ++-- src/backend/access/spgist/spginsert.c | 2 +- src/backend/access/transam/xlog.c | 24 +++++++++++++++++------- src/backend/commands/dbcommands.c | 4 ++-- src/include/access/xlog.h | 4 ++-- 7 files changed, 33 insertions(+), 23 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 466113c8524..c6e8b517c29 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -559,7 +559,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode, forknum); } @@ -604,7 +604,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest, RelFileNode *rnode) +zenith_get_request_lsn(bool *latest, RelFileNode *rnode, ForkNumber forknum) { XLogRecPtr lsn; @@ -631,7 +631,7 @@ zenith_get_request_lsn(bool *latest, RelFileNode *rnode) * so our request cannot concern those. 
*/ *latest = true; - lsn = GetLastWrittenPageLSN(rnode->relNode); + lsn = GetLastWrittenPageLSN(rnode->relNode, forknum); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); @@ -717,7 +717,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node); + request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node, forkNum); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -1080,7 +1080,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node); + request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node, forkNum); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1285,7 +1285,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node); + request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node, forknum); { ZenithNblocksRequest request = { .req.tag = T_ZenithNblocksRequest, @@ -1345,7 +1345,7 @@ zenith_dbsize(Oid dbNode) XLogRecPtr request_lsn; bool latest; - request_lsn = zenith_get_request_lsn(&latest, NULL); + request_lsn = zenith_get_request_lsn(&latest, NULL, InvalidForkNumber); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, @@ -1432,7 +1432,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode, forknum); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c 
b/src/backend/access/gin/gininsert.c index b603ab1a567..a316e2aa29d 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -422,7 +422,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index dc401767cb7..4dd9f3d8912 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -336,7 +336,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); smgr_end_unlogged_build(index->rd_smgr); } @@ -469,7 +469,7 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenPageLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 502fd11ad0a..b8c279b568a 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -144,7 +144,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); smgr_end_unlogged_build(index->rd_smgr); diff 
--git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c9476a5d4dc..d8784b6bc3d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -610,6 +610,11 @@ typedef struct XLogCtlInsert WALInsertLockPadded *WALInsertLocks; } XLogCtlInsert; +typedef struct RnodeForkKey { + Oid rnode; + ForkNumber forknum; +} RnodeForkKey; + /* * Total shared-memory state for XLOG. */ @@ -758,7 +763,7 @@ typedef struct XLogCtlData */ XLogRecPtr lastWrittenPageLsn; XLogRecPtr lastWrittenPageCacheLsn[LAST_WRITTEN_CACHE_SIZE]; - Oid lastWrittenPageCacheOid[LAST_WRITTEN_CACHE_SIZE]; + RnodeForkKey lastWrittenPageCacheOid[LAST_WRITTEN_CACHE_SIZE]; size_t lastWrittenPageCacheClock; /* Pointer of the victim element for clock replacement algorithm */ /* neon: copy of startup's RedoStartLSN for walproposer's use */ @@ -772,6 +777,7 @@ typedef struct XLogCtlData slock_t info_lck; /* locks shared variables shown above */ } XLogCtlData; + static XLogCtlData *XLogCtl = NULL; /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */ @@ -8103,7 +8109,8 @@ StartupXLOG(void) for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) { XLogCtl->lastWrittenPageCacheLsn[i] = InvalidXLogRecPtr; - XLogCtl->lastWrittenPageCacheOid[i] = InvalidOid; + struct RnodeForkKey key = {InvalidOid, InvalidForkNumber}; + XLogCtl->lastWrittenPageCacheOid[i] = key; } XLogCtl->lastWrittenPageCacheClock = 0; LocalSetXLogInsertAllowed(); @@ -8830,7 +8837,7 @@ GetInsertRecPtr(void) * GetLastWrittenPageLSN -- Returns maximal LSN of written page */ XLogRecPtr -GetLastWrittenPageLSN(Oid rnode) +GetLastWrittenPageLSN(Oid rnode, ForkNumber forknum) { XLogRecPtr lsn; SpinLockAcquire(&XLogCtl->info_lck); @@ -8839,7 +8846,8 @@ GetLastWrittenPageLSN(Oid rnode) { for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) { - if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) + RnodeForkKey key = XLogCtl->lastWrittenPageCacheOid[i]; + if (rnode == key.rnode && forknum == 
key.rnode) { lsn = XLogCtl->lastWrittenPageCacheLsn[i]; break; @@ -8864,7 +8872,7 @@ GetLastWrittenPageLSN(Oid rnode) * SetLastWrittenPageLSN -- Set maximal LSN of written page */ void -SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) +SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode, ForkNumber forknum) { SpinLockAcquire(&XLogCtl->info_lck); if (rnode == InvalidOid) @@ -8877,7 +8885,8 @@ SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) int i = LAST_WRITTEN_CACHE_SIZE; while (--i >= 0) { - if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) + RnodeForkKey key = XLogCtl->lastWrittenPageCacheOid[i]; + if (rnode == key.rnode && forknum == key.rnode) { if (lsn > XLogCtl->lastWrittenPageCacheLsn[i]) { @@ -8893,7 +8902,8 @@ SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) { XLogCtl->lastWrittenPageLsn = XLogCtl->lastWrittenPageCacheLsn[victim]; } - XLogCtl->lastWrittenPageCacheOid[victim] = rnode; + struct RnodeForkKey key = {rnode, forknum}; + XLogCtl->lastWrittenPageCacheOid[victim] = key; XLogCtl->lastWrittenPageCacheLsn[victim] = lsn; } } diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index f562e433747..e60922b8fb9 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -674,7 +674,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenPageLSN(lsn, InvalidOid); + SetLastWrittenPageLSN(lsn, InvalidOid, InvalidForkNumber); } } table_endscan(scan); @@ -2225,7 +2225,7 @@ dbase_redo(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; - SetLastWrittenPageLSN(lsn, InvalidOid); + SetLastWrittenPageLSN(lsn, InvalidOid, InvalidForkNumber); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index ae0fd33a834..a8639e1a78d 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -351,8 +351,8 @@ extern XLogRecPtr GetFlushRecPtr(void); 
extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenPageLSN(XLogRecPtr lsn, Oid relfilenode); -extern XLogRecPtr GetLastWrittenPageLSN(Oid relfilenode); +extern void SetLastWrittenPageLSN(XLogRecPtr lsn, Oid relfilenode, ForkNumber forknum); +extern XLogRecPtr GetLastWrittenPageLSN(Oid relfilenode, ForkNumber forknum); extern XLogRecPtr GetRedoStartLsn(void); From beace5d88be29ea0074f29318f532691b03f73de Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 31 May 2022 15:25:57 +0300 Subject: [PATCH 158/167] Undo using of forknumber in last written page LSN cache --- contrib/neon/pagestore_smgr.c | 6 +++--- src/backend/access/gin/gininsert.c | 2 +- src/backend/access/gist/gistbuild.c | 4 ++-- src/backend/access/spgist/spginsert.c | 2 +- src/backend/access/transam/xlog.c | 27 ++++++++++----------------- src/backend/commands/dbcommands.c | 4 ++-- src/include/access/xlog.h | 4 ++-- 7 files changed, 21 insertions(+), 28 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index c6e8b517c29..964d85e0205 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -559,7 +559,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode, forknum); + SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); } @@ -631,7 +631,7 @@ zenith_get_request_lsn(bool *latest, RelFileNode *rnode, ForkNumber forknum) * so our request cannot concern those. 
*/ *latest = true; - lsn = GetLastWrittenPageLSN(rnode->relNode, forknum); + lsn = GetLastWrittenPageLSN(rnode->relNode); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); @@ -1432,7 +1432,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode, forknum); + SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index a316e2aa29d..b603ab1a567 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -422,7 +422,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 4dd9f3d8912..dc401767cb7 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -336,7 +336,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); smgr_end_unlogged_build(index->rd_smgr); } @@ -469,7 +469,7 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenPageLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); + SetLastWrittenPageLSN(lsn, 
state->indexrel->rd_smgr->smgr_rnode.node.relNode); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index b8c279b568a..502fd11ad0a 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -144,7 +144,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) 0, RelationGetNumberOfBlocks(index), true); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, MAIN_FORKNUM); + SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); smgr_end_unlogged_build(index->rd_smgr); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index d8784b6bc3d..3f8a018afa8 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -610,11 +610,6 @@ typedef struct XLogCtlInsert WALInsertLockPadded *WALInsertLocks; } XLogCtlInsert; -typedef struct RnodeForkKey { - Oid rnode; - ForkNumber forknum; -} RnodeForkKey; - /* * Total shared-memory state for XLOG. 
*/ @@ -763,7 +758,7 @@ typedef struct XLogCtlData */ XLogRecPtr lastWrittenPageLsn; XLogRecPtr lastWrittenPageCacheLsn[LAST_WRITTEN_CACHE_SIZE]; - RnodeForkKey lastWrittenPageCacheOid[LAST_WRITTEN_CACHE_SIZE]; + Oid lastWrittenPageCacheOid[LAST_WRITTEN_CACHE_SIZE]; size_t lastWrittenPageCacheClock; /* Pointer of the victim element for clock replacement algorithm */ /* neon: copy of startup's RedoStartLSN for walproposer's use */ @@ -8109,8 +8104,7 @@ StartupXLOG(void) for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) { XLogCtl->lastWrittenPageCacheLsn[i] = InvalidXLogRecPtr; - struct RnodeForkKey key = {InvalidOid, InvalidForkNumber}; - XLogCtl->lastWrittenPageCacheOid[i] = key; + XLogCtl->lastWrittenPageCacheOid[i] = InvalidOid; } XLogCtl->lastWrittenPageCacheClock = 0; LocalSetXLogInsertAllowed(); @@ -8837,7 +8831,7 @@ GetInsertRecPtr(void) * GetLastWrittenPageLSN -- Returns maximal LSN of written page */ XLogRecPtr -GetLastWrittenPageLSN(Oid rnode, ForkNumber forknum) +GetLastWrittenPageLSN(Oid rnode) { XLogRecPtr lsn; SpinLockAcquire(&XLogCtl->info_lck); @@ -8846,8 +8840,7 @@ GetLastWrittenPageLSN(Oid rnode, ForkNumber forknum) { for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) { - RnodeForkKey key = XLogCtl->lastWrittenPageCacheOid[i]; - if (rnode == key.rnode && forknum == key.rnode) + if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) { lsn = XLogCtl->lastWrittenPageCacheLsn[i]; break; @@ -8872,8 +8865,11 @@ GetLastWrittenPageLSN(Oid rnode, ForkNumber forknum) * SetLastWrittenPageLSN -- Set maximal LSN of written page */ void -SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode, ForkNumber forknum) +SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) { + if (lsn == InvalidXLogRecPtr) + return; + SpinLockAcquire(&XLogCtl->info_lck); if (rnode == InvalidOid) { @@ -8885,8 +8881,7 @@ SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode, ForkNumber forknum) int i = LAST_WRITTEN_CACHE_SIZE; while (--i >= 0) { - RnodeForkKey key = XLogCtl->lastWrittenPageCacheOid[i]; - if 
(rnode == key.rnode && forknum == key.rnode) + if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) { if (lsn > XLogCtl->lastWrittenPageCacheLsn[i]) { @@ -8902,9 +8897,7 @@ SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode, ForkNumber forknum) { XLogCtl->lastWrittenPageLsn = XLogCtl->lastWrittenPageCacheLsn[victim]; } - struct RnodeForkKey key = {rnode, forknum}; - XLogCtl->lastWrittenPageCacheOid[victim] = key; - XLogCtl->lastWrittenPageCacheLsn[victim] = lsn; + XLogCtl->lastWrittenPageCacheOid[victim] = rnode; } } SpinLockRelease(&XLogCtl->info_lck); diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index e60922b8fb9..f562e433747 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -674,7 +674,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenPageLSN(lsn, InvalidOid, InvalidForkNumber); + SetLastWrittenPageLSN(lsn, InvalidOid); } } table_endscan(scan); @@ -2225,7 +2225,7 @@ dbase_redo(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; - SetLastWrittenPageLSN(lsn, InvalidOid, InvalidForkNumber); + SetLastWrittenPageLSN(lsn, InvalidOid); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a8639e1a78d..ae0fd33a834 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -351,8 +351,8 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenPageLSN(XLogRecPtr lsn, Oid relfilenode, ForkNumber forknum); -extern XLogRecPtr GetLastWrittenPageLSN(Oid relfilenode, ForkNumber forknum); +extern void SetLastWrittenPageLSN(XLogRecPtr lsn, Oid relfilenode); +extern XLogRecPtr GetLastWrittenPageLSN(Oid relfilenode); extern XLogRecPtr GetRedoStartLsn(void); From f27f1fc050c8e12e851b80e467999b6763524787 Mon Sep 17 00:00:00 2001 
From: Konstantin Knizhnik Date: Tue, 31 May 2022 17:51:42 +0300 Subject: [PATCH 159/167] Pass relNode to zenith_get_request_lsn --- contrib/neon/pagestore_smgr.c | 12 ++++++------ src/backend/access/transam/xlog.c | 9 +++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 964d85e0205..8843cfc25a8 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -604,7 +604,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest, RelFileNode *rnode, ForkNumber forknum) +zenith_get_request_lsn(bool *latest, Oid rnode) { XLogRecPtr lsn; @@ -631,7 +631,7 @@ zenith_get_request_lsn(bool *latest, RelFileNode *rnode, ForkNumber forknum) * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenPageLSN(rnode->relNode); + lsn = GetLastWrittenPageLSN(rnode); Assert(lsn != InvalidXLogRecPtr); elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); @@ -717,7 +717,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node, forkNum); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -1080,7 +1080,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node, forkNum); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1285,7 +1285,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - 
request_lsn = zenith_get_request_lsn(&latest, &reln->smgr_rnode.node, forknum); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode); { ZenithNblocksRequest request = { .req.tag = T_ZenithNblocksRequest, @@ -1345,7 +1345,7 @@ zenith_dbsize(Oid dbNode) XLogRecPtr request_lsn; bool latest; - request_lsn = zenith_get_request_lsn(&latest, NULL, InvalidForkNumber); + request_lsn = zenith_get_request_lsn(&latest, InvalidOid); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3f8a018afa8..526330c0003 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8828,7 +8828,9 @@ GetInsertRecPtr(void) } /* - * GetLastWrittenPageLSN -- Returns maximal LSN of written page + * GetLastWrittenPageLSN -- Returns maximal LSN of written page. + * It returns either cached last written LSN of particular relation, + * either global maximum of last written LSNs among all relations. */ XLogRecPtr GetLastWrittenPageLSN(Oid rnode) @@ -8862,7 +8864,10 @@ GetLastWrittenPageLSN(Oid rnode) } /* - * SetLastWrittenPageLSN -- Set maximal LSN of written page + * SetLastWrittenPageLSN -- Set maximal LSN of written page. + * We maintain small shared cache for last written LSN of least recently updated + * pages. This cache allows to keep global lastWrittenPageLsn unchanged and + * so avoid long wait for LSN for read requests to other relations. 
*/ void SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) From 37a0325b0dd7b874101b473ae5638f02ef454dd9 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 2 Jun 2022 18:28:21 +0300 Subject: [PATCH 160/167] Fix caching in SetLastWrittenLsn function --- src/backend/access/transam/xlog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 526330c0003..90b24464481 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8903,6 +8903,7 @@ SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) XLogCtl->lastWrittenPageLsn = XLogCtl->lastWrittenPageCacheLsn[victim]; } XLogCtl->lastWrittenPageCacheOid[victim] = rnode; + XLogCtl->lastWrittenPageCacheLsn[victim] = lsn; } } SpinLockRelease(&XLogCtl->info_lck); From c87176058e20deb843a0ca689cec57a61954ca55 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 22 Jun 2022 08:26:10 +0300 Subject: [PATCH 161/167] Keep last written LSN for each relation chunk to handle uploads of multiple relations and append-only tables --- contrib/neon/pagestore_smgr.c | 28 ++-- src/backend/access/gin/gininsert.c | 3 +- src/backend/access/gist/gistbuild.c | 8 +- src/backend/access/spgist/spginsert.c | 4 +- src/backend/access/transam/xlog.c | 184 +++++++++++++++-------- src/backend/commands/dbcommands.c | 4 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/utils/misc/guc.c | 10 ++ src/include/access/xlog.h | 5 +- 9 files changed, 162 insertions(+), 85 deletions(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 8843cfc25a8..f4f0dbe0313 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -84,6 +84,10 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; +/* + * Pseudo block number used to associate LSN with relation metadata (relation size */ +#define REL_METADATA_PSEUDO_BLOCKNO 0xFFFFFFFE + page_server_api *page_server; /* GUCs */ @@ 
-559,7 +563,7 @@ zenith_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Remember the LSN on this page. When we read the page again, we must * read the same or newer version of it. */ - SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); + SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, blocknum, blocknum); } @@ -604,7 +608,7 @@ zm_adjust_lsn(XLogRecPtr lsn) * Return LSN for requesting pages and number of blocks from page server */ static XLogRecPtr -zenith_get_request_lsn(bool *latest, Oid rnode) +zenith_get_request_lsn(bool *latest, Oid rnode, BlockNumber blkno) { XLogRecPtr lsn; @@ -631,9 +635,9 @@ zenith_get_request_lsn(bool *latest, Oid rnode) * so our request cannot concern those. */ *latest = true; - lsn = GetLastWrittenPageLSN(rnode); + lsn = GetLastWrittenLSN(rnode, blkno); Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenPageLSN lsn %X/%X ", + elog(DEBUG1, "zenith_get_request_lsn GetLastWrittenLSN lsn %X/%X ", (uint32) ((lsn) >> 32), (uint32) (lsn)); lsn = zm_adjust_lsn(lsn); @@ -717,7 +721,7 @@ zenith_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO); { ZenithExistsRequest request = { .req.tag = T_ZenithExistsRequest, @@ -792,10 +796,10 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWrittenPageLSN() when a new + * relation. 
Currently, we don't call SetLastWritten7LSN() when a new * relation created, so if we didn't remember the size in the relsize * cache, we might call smgrnblocks() on the newly-created relation before - * the creation WAL record hass been received by the page server. + * the creation WAL record has been received by the page server. */ set_cached_relsize(reln->smgr_rnode.node, forkNum, 0); @@ -905,6 +909,8 @@ zenith_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, if (IS_LOCAL_REL(reln)) mdextend(reln, forkNum, blkno, buffer, skipFsync); #endif + + SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO); } /* @@ -1080,7 +1086,7 @@ zenith_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, blkno); zenith_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer); #ifdef DEBUG_COMPARE_LOCAL @@ -1285,7 +1291,7 @@ zenith_nblocks(SMgrRelation reln, ForkNumber forknum) return n_blocks; } - request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode); + request_lsn = zenith_get_request_lsn(&latest, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO); { ZenithNblocksRequest request = { .req.tag = T_ZenithNblocksRequest, @@ -1345,7 +1351,7 @@ zenith_dbsize(Oid dbNode) XLogRecPtr request_lsn; bool latest; - request_lsn = zenith_get_request_lsn(&latest, InvalidOid); + request_lsn = zenith_get_request_lsn(&latest, InvalidOid, REL_METADATA_PSEUDO_BLOCKNO); { ZenithDbSizeRequest request = { .req.tag = T_ZenithDbSizeRequest, @@ -1432,7 +1438,7 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); - SetLastWrittenPageLSN(lsn, reln->smgr_rnode.node.relNode); + SetLastWrittenLSN(lsn, 
reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index b603ab1a567..9627ea5f346 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -421,9 +421,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, 0, RelationGetNumberOfBlocks(index)); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); - smgr_end_unlogged_build(index->rd_smgr); /* diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index dc401767cb7..ee8ab42166b 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -335,9 +335,10 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSN(XactLastRecEnd, + index->rd_smgr->smgr_rnode.node.relNode, + 0, RelationGetNumberOfBlocks(index)); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); - smgr_end_unlogged_build(index->rd_smgr); } @@ -469,7 +470,8 @@ gist_indexsortbuild(GISTBuildState *state) lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, pagestate->page, true); - SetLastWrittenPageLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode); + SetLastWrittenLSN(lsn, state->indexrel->rd_smgr->smgr_rnode.node.relNode, + GIST_ROOT_BLKNO, GIST_ROOT_BLKNO); } pfree(pagestate->page); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 502fd11ad0a..f293430f891 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -143,9 +143,9 @@ 
spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode, + 0, RelationGetNumberOfBlocks(index)); } - SetLastWrittenPageLSN(XactLastRecEnd, index->rd_smgr->smgr_rnode.node.relNode); - smgr_end_unlogged_build(index->rd_smgr); result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 90b24464481..edb2c2313a6 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -87,9 +87,6 @@ extern uint32 bootstrap_data_checksum_version; #define RECOVERY_COMMAND_FILE "recovery.conf" #define RECOVERY_COMMAND_DONE "recovery.done" -/* Size of last written page LSN cache. Should not be large because sequential search is used. */ -#define LAST_WRITTEN_CACHE_SIZE 4 - /* User-settable parameters */ int max_wal_size_mb = 1024; /* 1 GB */ int min_wal_size_mb = 80; /* 80 MB */ @@ -115,6 +112,7 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; uint64 predefined_sysidentifier; +int lastWrittenLsnCacheSize; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -184,6 +182,28 @@ const struct config_enum_entry recovery_target_action_options[] = { {NULL, 0, false} }; + +/* + * We do not take into account the dbnode, spcnode and forknum fields of + * the relation tag, because the possibility of collision is assumed to be + * small and should not affect performance. Reducing the cache key size also + * speeds up hash calculation and comparison. 
+ */ +typedef struct LastWrittenLsnCacheKey +{ + Oid relid; + BlockNumber basket; +} LastWrittenLsnCacheKey; + +typedef struct LastWrittenLsnCacheEntry +{ + LastWrittenLsnCacheKey key; + XLogRecPtr lsn; + /* L2-List for LRU replacement algorithm */ + struct LastWrittenLsnCacheEntry* next; + struct LastWrittenLsnCacheEntry* prev; +} LastWrittenLsnCacheEntry; + /* * Statistics for current checkpoint are collected in this global struct. * Because only the checkpointer or a stand-alone backend can perform @@ -753,13 +773,14 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; /* - * Cache of last written page LSN. - * We store this value for up to LAST_WRITTEN_CACHE_SIZE relations + maximum for all other relations. + * Maximal last written LSN for pges not present in lastWrittenLsnCache + */ + XLogRecPtr maxLastWrittenLsn; + + /* + * Double linked list to implement LRU replacement policy for last written LSN cache */ - XLogRecPtr lastWrittenPageLsn; - XLogRecPtr lastWrittenPageCacheLsn[LAST_WRITTEN_CACHE_SIZE]; - Oid lastWrittenPageCacheOid[LAST_WRITTEN_CACHE_SIZE]; - size_t lastWrittenPageCacheClock; /* Pointer of the victim element for clock replacement algorithm */ + LastWrittenLsnCacheEntry lastWrittenLsnLRU; /* neon: copy of startup's RedoStartLSN for walproposer's use */ XLogRecPtr RedoStartLSN; @@ -783,6 +804,11 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; + +static HTAB *lastWrittenLsnCache; + +#define LAST_WRITTEN_LSN_CACHE_BASKET 1024 /* blocks = 8Mb */ + /* * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! 
@@ -5143,11 +5169,8 @@ LocalProcessControlFile(bool reset) ReadControlFile(); } -/* - * Initialization of shared memory for XLOG - */ -Size -XLOGShmemSize(void) +static Size +XLOGCtlShmemSize(void) { Size size; @@ -5187,6 +5210,16 @@ XLOGShmemSize(void) return size; } +/* + * Initialization of shared memory for XLOG + */ +Size +XLOGShmemSize(void) +{ + return XLOGCtlShmemSize() + + hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry)); +} + void XLOGShmemInit(void) { @@ -5216,6 +5249,15 @@ XLOGShmemInit(void) XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); + { + static HASHCTL info; + info.keysize = sizeof(LastWrittenLsnCacheKey); + info.entrysize = sizeof(LastWrittenLsnCacheEntry); + lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache", + lastWrittenLsnCacheSize, lastWrittenLsnCacheSize, + &info, + HASH_ELEM | HASH_BLOBS); + } localControlFile = ControlFile; ControlFile = (ControlFileData *) ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile); @@ -8100,13 +8142,8 @@ StartupXLOG(void) XLogCtl->LogwrtRqst.Write = EndOfLog; XLogCtl->LogwrtRqst.Flush = EndOfLog; - XLogCtl->lastWrittenPageLsn = EndOfLog; - for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) - { - XLogCtl->lastWrittenPageCacheLsn[i] = InvalidXLogRecPtr; - XLogCtl->lastWrittenPageCacheOid[i] = InvalidOid; - } - XLogCtl->lastWrittenPageCacheClock = 0; + XLogCtl->maxLastWrittenLsn = EndOfLog; + XLogCtl->lastWrittenLsnLRU.next = XLogCtl->lastWrittenLsnLRU.prev = &XLogCtl->lastWrittenLsnLRU; LocalSetXLogInsertAllowed(); /* If necessary, write overwrite-contrecord before doing anything else */ @@ -8828,85 +8865,106 @@ GetInsertRecPtr(void) } /* - * GetLastWrittenPageLSN -- Returns maximal LSN of written page. + * GetLastWrittenLSN -- Returns maximal LSN of written page. * It returns either cached last written LSN of particular relation, * either global maximum of last written LSNs among all relations. 
*/ XLogRecPtr -GetLastWrittenPageLSN(Oid rnode) +GetLastWrittenLSN(Oid rnode, BlockNumber blkno) { XLogRecPtr lsn; - SpinLockAcquire(&XLogCtl->info_lck); - lsn = XLogCtl->lastWrittenPageLsn; + LastWrittenLsnCacheEntry* entry; + + LWLockAcquire(LastWrittenLsnLock, LW_SHARED); + + /* Maximal last written LSN among all non-cached pages */ + lsn = XLogCtl->maxLastWrittenLsn; + if (rnode != InvalidOid) { - for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) - { - if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) - { - lsn = XLogCtl->lastWrittenPageCacheLsn[i]; - break; - } - } + LastWrittenLsnCacheKey key; + key.relid = rnode; + key.basket = blkno / LAST_WRITTEN_LSN_CACHE_BASKET; + entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); + if (entry != NULL) + lsn = entry->lsn; } else { + HASH_SEQ_STATUS seq; /* Find maximum of all cached LSNs */ - for (int i = 0; i < LAST_WRITTEN_CACHE_SIZE; i++) + hash_seq_init(&seq, lastWrittenLsnCache); + while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL) { - if (XLogCtl->lastWrittenPageCacheLsn[i] > lsn) - lsn = XLogCtl->lastWrittenPageCacheLsn[i]; + if (entry->lsn > lsn) + lsn = entry->lsn; } } - SpinLockRelease(&XLogCtl->info_lck); + LWLockRelease(LastWrittenLsnLock); return lsn; } /* - * SetLastWrittenPageLSN -- Set maximal LSN of written page. - * We maintain small shared cache for last written LSN of least recently updated - * pages. This cache allows to keep global lastWrittenPageLsn unchanged and - * so avoid long wait for LSN for read requests to other relations. + * SetLastWrittenLSN -- Set maximal LSN of written page. + * We maintain cache of last written LSNs with limited size and LRU replacement + * policy. To reduce cache size we store max LSN not for each page, but for + * backet (1024 blocks). This cache allows to use old LSN when + * requesting pages of unchanged or appended relations. 
*/ void -SetLastWrittenPageLSN(XLogRecPtr lsn, Oid rnode) +SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) { if (lsn == InvalidXLogRecPtr) return; - SpinLockAcquire(&XLogCtl->info_lck); + LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE); if (rnode == InvalidOid) { - if (lsn > XLogCtl->lastWrittenPageLsn) - XLogCtl->lastWrittenPageLsn = lsn; + if (lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = lsn; } else { - int i = LAST_WRITTEN_CACHE_SIZE; - while (--i >= 0) + LastWrittenLsnCacheEntry* entry; + LastWrittenLsnCacheKey key; + bool found; + BlockNumber basket; + + key.relid = rnode; + for (basket = from / LAST_WRITTEN_LSN_CACHE_BASKET; + basket <= till / LAST_WRITTEN_LSN_CACHE_BASKET; + basket++) { - if (rnode == XLogCtl->lastWrittenPageCacheOid[i]) + key.basket = basket; + entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); + if (found) { - if (lsn > XLogCtl->lastWrittenPageCacheLsn[i]) - { - XLogCtl->lastWrittenPageCacheLsn[i] = lsn; - } - break; + if (lsn > entry->lsn) + entry->lsn = lsn; + /* Unlink from LRU list */ + entry->next->prev = entry->prev; + entry->prev->next = entry->next; } - } - if (i < 0) - { - int victim = ++XLogCtl->lastWrittenPageCacheClock % LAST_WRITTEN_CACHE_SIZE; - if (XLogCtl->lastWrittenPageCacheLsn[victim] > XLogCtl->lastWrittenPageLsn) + else { - XLogCtl->lastWrittenPageLsn = XLogCtl->lastWrittenPageCacheLsn[victim]; + entry->lsn = lsn; + if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize) + { + /* Replace least recently used entry */ + LastWrittenLsnCacheEntry* victim = XLogCtl->lastWrittenLsnLRU.prev; + victim->next->prev = victim->prev; + victim->prev->next = victim->next; + hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); + } } - XLogCtl->lastWrittenPageCacheOid[victim] = rnode; - XLogCtl->lastWrittenPageCacheLsn[victim] = lsn; + /* Link to the head of LRU list */ + entry->next = XLogCtl->lastWrittenLsnLRU.next; + entry->prev = 
&XLogCtl->lastWrittenLsnLRU; + XLogCtl->lastWrittenLsnLRU.next = entry->next->prev = entry; } - } - SpinLockRelease(&XLogCtl->info_lck); + } + LWLockRelease(LastWrittenLsnLock); } /* diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index f562e433747..9e21776b895 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -674,7 +674,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE | XLR_SPECIAL_REL_UPDATE); - SetLastWrittenPageLSN(lsn, InvalidOid); + SetLastWrittenLSN(lsn, InvalidOid, 0, 0); } } table_endscan(scan); @@ -2225,7 +2225,7 @@ dbase_redo(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; - SetLastWrittenPageLSN(lsn, InvalidOid); + SetLastWrittenLSN(lsn, InvalidOid, 0, 0); } } else if (info == XLOG_DBASE_DROP) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c2956..b4652c33ff6 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +LastWrittenLsnLock 48 diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ed369c0be75..3565ea828cc 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2366,6 +2366,16 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"lsn_cache_size", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Size of las written LSN cache used by Neon."), + NULL + }, + &lastWrittenLsnCacheSize, + 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb basket */ + NULL, NULL, NULL + }, + { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h 
index ae0fd33a834..cd4e6c7f876 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -132,6 +132,7 @@ extern char *PrimaryConnInfo; extern char *PrimarySlotName; extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; +extern int lastWrittenLsnCacheSize; /* indirectly set via GUC system */ extern TransactionId recoveryTargetXid; @@ -351,8 +352,8 @@ extern XLogRecPtr GetFlushRecPtr(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void RemovePromoteSignalFiles(void); -extern void SetLastWrittenPageLSN(XLogRecPtr lsn, Oid relfilenode); -extern XLogRecPtr GetLastWrittenPageLSN(Oid relfilenode); +extern void SetLastWrittenLSN(XLogRecPtr lsn, Oid relfilenode, BlockNumber from, BlockNumber till); +extern XLogRecPtr GetLastWrittenLSN(Oid relfilenode, BlockNumber blkno); extern XLogRecPtr GetRedoStartLsn(void); From 2e9ce7abc97c5f32ce544b54a0dcf0bb0e4e188e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 22 Jun 2022 10:23:48 +0300 Subject: [PATCH 162/167] Update contrib/neon/pagestore_smgr.c Co-authored-by: Heikki Linnakangas --- contrib/neon/pagestore_smgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index f4f0dbe0313..f24b337c46b 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -796,7 +796,7 @@ zenith_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * * FIXME: This is currently not just an optimization, but required for * correctness. Postgres can call smgrnblocks() on the newly-created - * relation. Currently, we don't call SetLastWritten7LSN() when a new + * relation. Currently, we don't call SetLastWrittenLSN() when a new * relation created, so if we didn't remember the size in the relsize * cache, we might call smgrnblocks() on the newly-created relation before * the creation WAL record has been received by the page server. 
From c977ff50d32f9ba80aa7e84391a1ff258441d04b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 22 Jun 2022 10:37:14 +0300 Subject: [PATCH 163/167] Adjust max last written LSN of non cached relations in case of eviction from cache --- contrib/neon/pagestore_smgr.c | 2 +- src/backend/access/transam/xlog.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index f24b337c46b..565e7c4e32d 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -86,7 +86,7 @@ const int SmgrTrace = DEBUG5; /* * Pseudo block number used to associate LSN with relation metadata (relation size */ -#define REL_METADATA_PSEUDO_BLOCKNO 0xFFFFFFFE +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber page_server_api *page_server; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index edb2c2313a6..e799c89810c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8953,6 +8953,10 @@ SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) { /* Replace least recently used entry */ LastWrittenLsnCacheEntry* victim = XLogCtl->lastWrittenLsnLRU.prev; + /* Adjust max LSN for not cached relations/chunks if needed */ + if (victim->lsn > XLogCtl->maxLastWrittenLsn) + XLogCtl->maxLastWrittenLsn = victim->lsn; + victim->next->prev = victim->prev; victim->prev->next = victim->next; hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL); From 2e4bf14c91cf59314214117aa7661806e7ae6086 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 23 Jun 2022 16:39:38 +0300 Subject: [PATCH 164/167] Update comments --- src/backend/access/transam/xlog.c | 45 ++++++++++++++++++++----------- src/backend/utils/misc/guc.c | 2 +- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e799c89810c..1b6705edbce 100644 --- 
a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -184,7 +184,7 @@ const struct config_enum_entry recovery_target_action_options[] = { /* - * We are not taken in acccout dbnode, spcnode, forknum fields of + * We are not taken in account dbnode, spcnode, forknum fields of * relation tag, because possibility of collision is assumed to be small * and should not affect performance. And reducing cache key size speed-up * hash calculation and comparison. @@ -192,7 +192,7 @@ const struct config_enum_entry recovery_target_action_options[] = { typedef struct LastWrittenLsnCacheKey { Oid relid; - BlockNumber basket; + BlockNumber bucket; } LastWrittenLsnCacheKey; typedef struct LastWrittenLsnCacheEntry @@ -773,12 +773,13 @@ typedef struct XLogCtlData XLogRecPtr lastFpwDisableRecPtr; /* - * Maximal last written LSN for pges not present in lastWrittenLsnCache + * Maximal last written LSN for pages not present in lastWrittenLsnCache */ XLogRecPtr maxLastWrittenLsn; /* - * Double linked list to implement LRU replacement policy for last written LSN cache + * Double linked list to implement LRU replacement policy for last written LSN cache. + * Access to this list as well as to last written LSN cache is protected by 'LastWrittenLsnLock'. */ LastWrittenLsnCacheEntry lastWrittenLsnLRU; @@ -804,10 +805,18 @@ static WALInsertLockPadded *WALInsertLocks = NULL; */ static ControlFileData *ControlFile = NULL; +#define LAST_WRITTEN_LSN_CACHE_BUCKET 1024 /* blocks = 8Mb */ -static HTAB *lastWrittenLsnCache; -#define LAST_WRITTEN_LSN_CACHE_BASKET 1024 /* blocks = 8Mb */ +/* + * Cache of last written LSN for each relation chunk (hash bucket). + * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last + * relation metadata update. + * Size of the cache is limited by GUC variable lastWrittnLsnCacheSize ("lsn_cache_size"), + * pages are replaced using LRU algirithm, based on L2-list. 
+ * Access to this cache is protected by 'LastWrittenLsnLock'. + */ +static HTAB *lastWrittenLsnCache; /* * Calculate the amount of space left on the page after 'endptr'. Beware @@ -8866,8 +8875,11 @@ GetInsertRecPtr(void) /* * GetLastWrittenLSN -- Returns maximal LSN of written page. - * It returns either cached last written LSN of particular relation, - * either global maximum of last written LSNs among all relations. + * It returns an upper bound for the last written LSN of a given page, + * either from a cached last written LSN or a global maximum last written LSN. + * If rnode is InvalidOid then we calculate maximum among all cached LSN and maxLastWrittenLsn. + * If cache is large enough ,iterting through all hash items may be rather expensive. + * But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize which is not performance critical. */ XLogRecPtr GetLastWrittenLSN(Oid rnode, BlockNumber blkno) @@ -8884,7 +8896,7 @@ GetLastWrittenLSN(Oid rnode, BlockNumber blkno) { LastWrittenLsnCacheKey key; key.relid = rnode; - key.basket = blkno / LAST_WRITTEN_LSN_CACHE_BASKET; + key.bucket = blkno / LAST_WRITTEN_LSN_CACHE_BUCKET; entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL); if (entry != NULL) lsn = entry->lsn; @@ -8909,8 +8921,11 @@ GetLastWrittenLSN(Oid rnode, BlockNumber blkno) * SetLastWrittenLSN -- Set maximal LSN of written page. * We maintain cache of last written LSNs with limited size and LRU replacement * policy. To reduce cache size we store max LSN not for each page, but for - * backet (1024 blocks). This cache allows to use old LSN when + * bucket (1024 blocks). This cache allows to use old LSN when * requesting pages of unchanged or appended relations. + * + * rnode can be InvalidOid, in this case maxLastWrittenLsn is updated. SetLastWrittensn with InvalidOid + * is used by createdb and dbase_redo functions. 
*/ void SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) @@ -8929,14 +8944,14 @@ SetLastWrittenLSN(XLogRecPtr lsn, Oid rnode, BlockNumber from, BlockNumber till) LastWrittenLsnCacheEntry* entry; LastWrittenLsnCacheKey key; bool found; - BlockNumber basket; + BlockNumber bucket; key.relid = rnode; - for (basket = from / LAST_WRITTEN_LSN_CACHE_BASKET; - basket <= till / LAST_WRITTEN_LSN_CACHE_BASKET; - basket++) + for (bucket = from / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket <= till / LAST_WRITTEN_LSN_CACHE_BUCKET; + bucket++) { - key.basket = basket; + key.bucket = bucket; entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found); if (found) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3565ea828cc..ffc2b440285 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2372,7 +2372,7 @@ static struct config_int ConfigureNamesInt[] = NULL }, &lastWrittenLsnCacheSize, - 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb basket */ + 1024, 10, 1000000, /* 1024 is enough to hold 10GB database with 8Mb bucket */ NULL, NULL, NULL }, From 70c79ef606686a1648e044ff92c9d6637e4750c0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 23 Jun 2022 18:11:50 +0300 Subject: [PATCH 165/167] Update src/backend/access/transam/xlog.c Co-authored-by: Thang Pham --- src/backend/access/transam/xlog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1b6705edbce..c39e5743a1f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -812,8 +812,8 @@ static ControlFileData *ControlFile = NULL; * Cache of last written LSN for each relation chunk (hash bucket). * Also to provide request LSN for smgrnblocks, smgrexists there is pseudokey=InvalidBlockId which stores LSN of last * relation metadata update. 
- * Size of the cache is limited by GUC variable lastWrittnLsnCacheSize ("lsn_cache_size"), - * pages are replaced using LRU algirithm, based on L2-list. + * Size of the cache is limited by GUC variable lastWrittenLsnCacheSize ("lsn_cache_size"), + * pages are replaced using LRU algorithm, based on L2-list. * Access to this cache is protected by 'LastWrittenLsnLock'. */ static HTAB *lastWrittenLsnCache; From b329959b17b3f979917e4182fa887dbd8701afe0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 7 Jul 2022 22:03:27 +0300 Subject: [PATCH 166/167] Update contrib/neon/pagestore_smgr.c Co-authored-by: Heikki Linnakangas --- contrib/neon/pagestore_smgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index 565e7c4e32d..f6e36975662 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -85,7 +85,8 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; /* - * Pseudo block number used to associate LSN with relation metadata (relation size */ + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ #define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber page_server_api *page_server; From a9422b7dad1acb40e7ab27d54aef8606c91103bf Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 7 Jul 2022 22:56:39 +0300 Subject: [PATCH 167/167] Add requested comment --- contrib/neon/pagestore_smgr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/contrib/neon/pagestore_smgr.c b/contrib/neon/pagestore_smgr.c index f6e36975662..0b834811a33 100644 --- a/contrib/neon/pagestore_smgr.c +++ b/contrib/neon/pagestore_smgr.c @@ -1439,6 +1439,10 @@ zenith_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ XLogFlush(lsn); + /* + * Truncate may affect several chunks of relations. So we should either update last written LSN for all of them, + * either update LSN for "dummy" metadata block. 
Second approach seems to be more efficient. + */ SetLastWrittenLSN(lsn, reln->smgr_rnode.node.relNode, REL_METADATA_PSEUDO_BLOCKNO, REL_METADATA_PSEUDO_BLOCKNO); #ifdef DEBUG_COMPARE_LOCAL