From d61e9241035c3d03eb90a5722b0d38769f92e18a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 27 May 2024 15:57:57 +0300 Subject: [PATCH 01/31] Fix connect to PS on MacOS/X (#7885) ## Problem After [0e4f1826805d040a23c25c54f3993a942755dbc2] which introduce async connect Neon is not able to connect to page server. ## Summary of changes Perform sync commit at MacOS/X ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 89 +++++++++++++++++++------------------- pgxn/neon/pagestore_smgr.c | 10 ++--- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a9c8d59c3a9f..5eae2d8204f0 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -125,13 +125,6 @@ typedef struct * - WL_EXIT_ON_PM_DEATH. */ WaitEventSet *wes_read; - /*--- - * WaitEventSet containing: - * - WL_SOCKET_WRITABLE on 'conn' - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. - */ - WaitEventSet *wes_write; } PageServer; static PageServer page_servers[MAX_SHARDS]; @@ -336,11 +329,6 @@ CLEANUP_AND_DISCONNECT(PageServer *shard) FreeWaitEventSet(shard->wes_read); shard->wes_read = NULL; } - if (shard->wes_write) - { - FreeWaitEventSet(shard->wes_write); - shard->wes_write = NULL; - } if (shard->conn) { PQfinish(shard->conn); @@ -436,22 +424,6 @@ pageserver_connect(shardno_t shard_no, int elevel) return false; } - shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); - - shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, - PQsocket(shard->conn), - NULL, NULL); - shard->state = PS_Connecting_Startup; /* fallthrough */ } @@ -460,13 +432,12 @@ pageserver_connect(shardno_t shard_no, int elevel) char *pagestream_query; int ps_send_query_ret; bool connected = false; - + int poll_result = PGRES_POLLING_WRITING; neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); do { WaitEvent event; - int poll_result = PQconnectPoll(shard->conn); switch (poll_result) { @@ -497,25 +468,45 @@ pageserver_connect(shardno_t shard_no, int elevel) } case PGRES_POLLING_READING: /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - /* query cancellation, backend shutdown */ - CHECK_FOR_INTERRUPTS(); - + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); 
+ if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_READABLE) + break; + } /* PQconnectPoll() handles the socket polling state updates */ break; case PGRES_POLLING_WRITING: /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_write, -1L, &event, 1, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - /* query cancellation, backend shutdown */ - CHECK_FOR_INTERRUPTS(); - + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_WRITEABLE) + break; + } /* PQconnectPoll() handles the socket polling state updates */ break; @@ -524,12 +515,22 @@ pageserver_connect(shardno_t shard_no, int elevel) connected = true; break; } + poll_result = PQconnectPoll(shard->conn); + elog(DEBUG5, "PQconnectPoll=>%d", poll_result); } while (!connected); /* No more polling needed; connection succeeded */ shard->last_connect_time = GetCurrentTimestamp(); + shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + switch (neon_protocol_version) { case 2: diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ac505fe6fbb0..0e4d210be8ce 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -584,9 +584,9 @@ prefetch_read(PrefetchRequest *slot) slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu", - slot->status, (size_t) (void *) slot->response, - slot->my_ring_index, MyPState->ring_receive); + "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long)slot->my_ring_index, (long)MyPState->ring_receive); old = MemoryContextSwitchTo(MyPState->errctx); response = (NeonResponse *) page_server->receive(slot->shard_no); @@ -606,8 +606,8 @@ prefetch_read(PrefetchRequest *slot) else { neon_shard_log(slot->shard_no, WARNING, - "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - slot->my_ring_index, + "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", + (long)slot->my_ring_index, RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), slot->buftag.forkNum, slot->buftag.blockNum); return false; From 4a0ce9512b5eb26b636006cda2488411d07bfc03 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 27 May 2024 17:35:46 +0300 Subject: [PATCH 02/31] Add safekeeper test truncating WAL. We do it as a part of more complicated tests like test_compute_restarts, but let's have a simple test as well. 
--- .../regress/test_wal_acceptor_async.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b5d86de5743b..715d22eed83d 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -531,6 +531,64 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) +async def run_wal_truncation(env: NeonEnv): + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (sk1, sk2, sk3) = env.safekeepers + + ep = env.endpoints.create_start("main") + ep.safe_psql("create table t (key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + # insert with only one sk3 up to create tail of flushed but not committed WAL on it + sk1.stop() + sk2.stop() + conn = await ep.connect_async() + # query should hang, so execute in separate task + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1, 180000), 'Papaya'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. + ep.stop_and_destroy() + + # stop sk3 as well + sk3.stop() + + # now start sk1 and sk2 and make them commit something + sk1.start() + sk2.start() + ep = env.endpoints.create_start( + "main", + ) + ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") + + # start sk3 and wait for it to catch up + sk3.start() + flush_lsn = Lsn(ep.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()")) + await wait_for_lsn(sk3, tenant_id, timeline_id, flush_lsn) + + timeline_start_lsn = sk1.get_timeline_start_lsn(tenant_id, timeline_id) + digests = [ + sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, flush_lsn) + for sk in [sk1, sk2] + ] + assert digests[0] == digests[1], f"digest on sk1 is {digests[0]} but on sk3 is {digests[1]}" + + +# Simple deterministic test creating tail of WAL on safekeeper which is +# truncated when majority without this sk elects walproposer starting earlier. +def test_wal_truncation(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_wal_truncation(env)) + + async def run_segment_init_failure(env: NeonEnv): env.neon_cli.create_branch("test_segment_init_failure") ep = env.endpoints.create_start("test_segment_init_failure") From fabeff822fac24b7ba45214e907295003874252a Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 28 May 2024 13:05:33 +0200 Subject: [PATCH 03/31] Performance test for pgvector HNSW index build and queries (#7873) ## Problem We want to regularly verify the performance of pgvector HNSW parallel index builds and parallel similarity search using HNSW indexes. The first release that considerably improved the index-build parallelism was pgvector 0.7.0 and we want to make sure that we do not regress by our neon compute VM settings (swap, memory over commit, pg conf etc.) ## Summary of changes Prepare a Neon project with 1 million openAI vector embeddings (vector size 1536). Run HNSW indexing operations in the regression test for the various distance metrics. Run similarity queries using pgbench with 100 concurrent clients. 
I have also added the relevant metrics to the grafana dashboards pgbench and olape --------- Co-authored-by: Alexander Bayandin --- .github/workflows/benchmarking.yml | 100 +++++++++++++++++- pyproject.toml | 1 + .../performance/pgvector/HNSW_build.sql | 47 ++++++++ .../performance/pgvector/IVFFLAT_build.sql | 52 +++++++++ test_runner/performance/pgvector/README.md | 38 +++++++ test_runner/performance/pgvector/loaddata.py | 72 +++++++++++++ ...ch_custom_script_pgvector_hsnw_queries.sql | 10 ++ .../pgvector/pgbench_hnsw_queries.sql | 13 +++ test_runner/performance/test_perf_olap.py | 34 ++++++ test_runner/performance/test_perf_pgbench.py | 31 ++++++ 10 files changed, 395 insertions(+), 3 deletions(-) create mode 100644 test_runner/performance/pgvector/HNSW_build.sql create mode 100644 test_runner/performance/pgvector/IVFFLAT_build.sql create mode 100644 test_runner/performance/pgvector/README.md create mode 100644 test_runner/performance/pgvector/loaddata.py create mode 100644 test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql create mode 100644 test_runner/performance/pgvector/pgbench_hnsw_queries.sql diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 1eaf05cd54a5..d5a375d70429 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -38,6 +38,11 @@ on: description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch' required: false default: false + run_only_pgvector_tests: + type: boolean + description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run' + required: false + default: false defaults: run: @@ -50,6 +55,7 @@ concurrency: jobs: bench: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -120,6 +126,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} generate-matrices: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: @@ -197,6 +204,7 @@ jobs: echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT pgbench-compare: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices ] strategy: @@ -343,6 +351,92 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + pgbench-pgvector: + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "15m" + TEST_PG_BENCH_SCALES_MATRIX: "1" + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 16 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-captest-pgvector" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH + run: | + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo 
"${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH + + - name: Set up Connection String + id: set-up-connstr + run: | + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + QUERIES=("SELECT version()") + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done + + - name: Benchmark pgvector hnsw indexing + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_perf_olap.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Benchmark pgvector hnsw queries + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. @@ -351,7 +445,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, pgbench-compare ] strategy: @@ -455,7 +549,7 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. 
# # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, clickbench-compare ] strategy: @@ -557,7 +651,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, tpch-compare ] strategy: diff --git a/pyproject.toml b/pyproject.toml index 131d1121f73c..c7f1a0751284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ build-backend = "poetry.core.masonry.api" exclude = [ "^vendor/", "^target/", + "test_runner/performance/pgvector/loaddata.py", ] check_untyped_defs = true # Help mypy find imports when running against list of individual files. diff --git a/test_runner/performance/pgvector/HNSW_build.sql b/test_runner/performance/pgvector/HNSW_build.sql new file mode 100644 index 000000000000..9e6918b75571 --- /dev/null +++ b/test_runner/performance/pgvector/HNSW_build.sql @@ -0,0 +1,47 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS hnsw_test_table; +CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA; +INSERT INTO hnsw_test_table SELECT * FROM documents; +CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create HNSW index for the supported distance metrics +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops); +-- note: in a second psql session we can monitor the progress of the index build phases using +-- the following query: +-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index; + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid +JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'hnsw_test_table' + AND a.attname = 'embeddings'; + +-- show table sizes +\dt+ diff --git a/test_runner/performance/pgvector/IVFFLAT_build.sql b/test_runner/performance/pgvector/IVFFLAT_build.sql new file mode 100644 index 000000000000..338980831a90 --- /dev/null +++ b/test_runner/performance/pgvector/IVFFLAT_build.sql @@ -0,0 +1,52 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS ivfflat_test_table; +CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA; +INSERT INTO ivfflat_test_table SELECT * FROM documents; +CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET 
max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create ivfflat index for the supported distance metrics +-- the formulat for lists is # rows / 1000 or sqrt(# rows) if # rows > 1 million +-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents +-- so we use 1000 lists +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table + USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000); + +\d ivfflat_test_table + + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid +JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'ivfflat_test_table' + AND a.attname = 'embeddings'; +-- show table sizes +\dt+ + + diff --git a/test_runner/performance/pgvector/README.md b/test_runner/performance/pgvector/README.md new file mode 100644 index 000000000000..c55db12e74b8 --- /dev/null +++ b/test_runner/performance/pgvector/README.md @@ -0,0 +1,38 @@ +--- +dataset_info: + features: + - name: _id + dtype: string + - name: title + dtype: string + - name: text + dtype: string + - name: text-embedding-3-large-1536-embedding + sequence: float64 + splits: + - name: train + num_bytes: 12679725776 + num_examples: 1000000 + download_size: 9551862565 + dataset_size: 12679725776 +configs: +- config_name: default + data_files: + - split: train + path: data/train-* +license: mit +task_categories: +- feature-extraction +language: +- en +size_categories: +- 1M ") + + +def main(conn_str, directory_path): + # Connection to PostgreSQL + with psycopg2.connect(conn_str) as conn: + with conn.cursor() as cursor: + # Run SQL statements + cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;") + register_vector(conn) + cursor.execute("DROP TABLE IF EXISTS documents;") + cursor.execute( + """ + CREATE TABLE documents ( + _id TEXT PRIMARY KEY, + title TEXT, + text TEXT, + embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI) + ); + """ + ) + conn.commit() + + # List and sort Parquet files + parquet_files = sorted(Path(directory_path).glob("*.parquet")) + + for file in parquet_files: + print(f"Loading {file} into PostgreSQL") + df = pd.read_parquet(file) + + print(df.head()) + + data_list = [ + ( + row["_id"], + row["title"], + row["text"], + np.array(row["text-embedding-3-large-1536-embedding"]), + ) + for index, row in df.iterrows() + ] + # Use execute_values to perform batch insertion + execute_values( + cursor, + "INSERT INTO documents (_id, title, text, embeddings) VALUES %s", + data_list, + ) + # Commit after we insert all embeddings + conn.commit() + + print(f"Loaded {file} into PostgreSQL") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print_usage() + sys.exit(1) + + conn_str = sys.argv[1] + directory_path = sys.argv[2] + 
main(conn_str, directory_path) diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql new file mode 100644 index 000000000000..886ae9645bd7 --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql @@ -0,0 +1,10 @@ +with x (x) as ( + select "embeddings" as x + from hnsw_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM hnsw_test_table +ORDER BY 2 +LIMIT 30; diff --git a/test_runner/performance/pgvector/pgbench_hnsw_queries.sql b/test_runner/performance/pgvector/pgbench_hnsw_queries.sql new file mode 100644 index 000000000000..5034063c1b61 --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_hnsw_queries.sql @@ -0,0 +1,13 @@ +-- run with pooled connection +-- pgbench -T 300 -c 100 -j20 -f pgbench_hnsw_queries.sql -postgresql://neondb_owner:@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require" + +with x (x) as ( + select "embeddings" as x + from hnsw_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM hnsw_test_table +ORDER BY 2 +LIMIT 30; diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 8a9509ea44b9..2367676e6707 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -100,6 +100,25 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): ) # fmt: on +# A list of pgvector HNSW index builds to run. +# Please do not alter the label for the query, as it is used to identify it. +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off +PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = ( + LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"), + LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"), + LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"), + LabelledQuery("PGV3", r"CREATE INDEX ON hnsw_test_table (_id);"), + LabelledQuery("PGV4", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);"), + LabelledQuery("PGV5", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);"), + LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"), + LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"), + LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"), +) +# fmt: on + + EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" @@ -245,3 +264,18 @@ def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare): log.info("Collecting pg_stat_statements") query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;") run_psql(remote_compare, query, times=1, explain=False) + + +@pytest.mark.parametrize("query", PGVECTOR_QUERIES) +@pytest.mark.remote_cluster +def test_pgvector_indexing(query: LabelledQuery, remote_compare: RemoteCompare): + """ + An pgvector test that tests HNSW index build performance and parallelism. + + The DB prepared manually in advance. 
+ See + - test_runner/performance/pgvector/README.md + - test_runner/performance/pgvector/loaddata.py + - test_runner/performance/pgvector/HNSW_build.sql + """ + run_psql(remote_compare, query, times=1, explain=False) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 2b8760dff2d1..d756d6eeca5b 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -17,6 +17,7 @@ class PgBenchLoadType(enum.Enum): INIT = "init" SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" + PGVECTOR_HNSW = "pgvector-hnsw" def utc_now_timestamp() -> int: @@ -132,6 +133,26 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P password=password, ) + if workload_type == PgBenchLoadType.PGVECTOR_HNSW: + # Run simple-update workload + run_pgbench( + env, + "pgvector-hnsw", + [ + "pgbench", + "-f", + "test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql", + "-c100", + "-j20", + f"-T{duration}", + "-P2", + "--protocol=prepared", + "--progress-timestamp", + connstr, + ], + password=password, + ) + env.report_size() @@ -201,3 +222,13 @@ def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, dur @pytest.mark.remote_cluster def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int): run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY) + + +# The following test runs on an existing database that has pgvector extension installed +# and a table with 1 million embedding vectors loaded and indexed with HNSW. +# +# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup. +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW) From f9f69a2ee7fb11b9d713bd3f2c50c5be516253c9 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 28 May 2024 16:21:09 +0200 Subject: [PATCH 04/31] clarify how to load the dbpedia vector embeddings into a postgres database (#7894) ## Problem Improve the readme for the data load step in the pgvector performance test. --- test_runner/performance/pgvector/README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/pgvector/README.md b/test_runner/performance/pgvector/README.md index c55db12e74b8..83495d270a0c 100644 --- a/test_runner/performance/pgvector/README.md +++ b/test_runner/performance/pgvector/README.md @@ -1,3 +1,20 @@ +# Source of the dataset for pgvector tests + +This readme was copied from https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M + +## Download the parquet files + +```bash +brew install git-lfs +git-lfs clone https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M +``` + +## Load into postgres: + +see loaddata.py in this directory + +## Rest of dataset card as on huggingface + --- dataset_info: features: @@ -35,4 +52,4 @@ size_categories: - Created: February 2024. 
- Text used for Embedding: title (string) + text (string) - Embedding Model: OpenAI text-embedding-3-large -- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ [here](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M) \ No newline at end of file +- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ \ No newline at end of file From 352b08d0be56c73ba9017a82cd6496aea7ba5758 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 28 May 2024 16:06:47 +0100 Subject: [PATCH 05/31] pageserver: fix a warning on secondary mode downloads after evictions (#7877) ## Problem In 4ce6e2d2fc we added a warning when progress stats don't look right at the end of a secondary download pass. This `Correcting drift in progress stats` warning fired in staging on a pageserver that had been doing some disk usage eviction. The impact is low because in the same place we log the warning, we also fix up the progress values. ## Summary of changes - When we skip downloading a layer because it was recently evicted, update the progress stats to ensure they still reach a clean complete state at the end of a download pass. - Also add a log for evicting secondary location layers, for symmetry with attached locations, so that we can clearly see when eviction has happened for a particular tenant's layers when investigating issues. This is a point fix -- the code would also benefit from being refactored so that there is some "download result" type with a Skip variant, to ensure that we are updating the progress stats uniformly for those cases. --- pageserver/src/tenant/secondary.rs | 1 + pageserver/src/tenant/secondary/downloader.rs | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 252b6eb11b62..af6840f525ae 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -187,6 +187,7 @@ impl SecondaryTenant { }; let now = SystemTime::now(); + tracing::info!("Evicting secondary layer"); let this = self.clone(); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 0ec1bd649b3e..5c915d6b53a2 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -909,6 +909,7 @@ impl<'a> TenantDownloader<'a> { strftime(&layer.access_time), strftime(evicted_at) ); + self.skip_layer(layer); continue; } } @@ -963,6 +964,15 @@ impl<'a> TenantDownloader<'a> { Ok(()) } + /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics + fn skip_layer(&self, layer: HeatMapLayer) { + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.layers_total = progress.layers_total.saturating_sub(1); + progress.bytes_total = progress + .bytes_total + .saturating_sub(layer.metadata.file_size); + } + async fn download_layer( &self, tenant_shard_id: &TenantShardId, @@ -1012,13 +1022,7 @@ impl<'a> TenantDownloader<'a> { "Skipped downloading missing layer {}, raced with compaction/gc?", layer.name ); - - // If the layer is 404, adjust the progress statistics to reflect that we will not download it. 
- let mut progress = self.secondary_state.progress.lock().unwrap(); - progress.layers_total = progress.layers_total.saturating_sub(1); - progress.bytes_total = progress - .bytes_total - .saturating_sub(layer.metadata.file_size); + self.skip_layer(layer); return Ok(None); } From 14df69d0e38c2ab3e1b8f1bef4a6981c842fd913 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 28 May 2024 17:40:52 +0200 Subject: [PATCH 06/31] Drop postgres-native-tls in favour of tokio-postgres-rustls (#7883) Get rid of postgres-native-tls and openssl in favour of rustls in our dependency tree. Do further steps to completely remove native-tls and openssl. Among other advantages, this allows us to do static musl builds more easily: #7889 --- Cargo.lock | 154 ++------------------ Cargo.toml | 15 +- deny.toml | 7 + proxy/Cargo.toml | 5 +- proxy/src/compute.rs | 84 +++++++++-- s3_scrubber/Cargo.toml | 6 +- s3_scrubber/src/scan_safekeeper_metadata.rs | 20 ++- workspace_hack/Cargo.toml | 4 +- 8 files changed, 124 insertions(+), 171 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d8f9021eb87d..b1a307dd1942 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -776,7 +776,6 @@ dependencies = [ "pin-project", "serde", "time", - "tz-rs", "url", "uuid", ] @@ -1291,12 +1290,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "const_fn" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935" - [[package]] name = "const_format" version = "0.2.30" @@ -1976,21 +1969,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.1.0" @@ -2620,19 +2598,6 @@ dependencies = [ "tokio-io-timeout", ] -[[package]] -name = "hyper-tls" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" -dependencies = [ - "bytes", - "hyper 0.14.26", - "native-tls", - "tokio", - "tokio-native-tls", -] - [[package]] name = "hyper-util" version = "0.1.3" @@ -3168,24 +3133,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nix" version = "0.25.1" @@ -3356,15 +3303,6 @@ dependencies = [ "libc", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" 
-dependencies = [ - "libc", -] - [[package]] name = "oauth2" version = "4.4.2" @@ -3414,50 +3352,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "openssl" -version = "0.10.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" -dependencies = [ - "bitflags 2.4.1", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.52", -] - [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" -[[package]] -name = "openssl-sys" -version = "0.9.96" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "opentelemetry" version = "0.20.0" @@ -4105,17 +4005,6 @@ dependencies = [ "tokio-postgres", ] -[[package]] -name = "postgres-native-tls" -version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" -dependencies = [ - "native-tls", - "tokio", - "tokio-native-tls", - "tokio-postgres", -] - [[package]] name = "postgres-protocol" version = "0.6.4" @@ -4423,7 +4312,6 @@ dependencies = [ "md5", "measured", "metrics", - "native-tls", "once_cell", "opentelemetry", "parking_lot 0.12.1", @@ -4431,7 +4319,6 @@ dependencies = [ "parquet_derive", "pbkdf2", "pin-project-lite", - "postgres-native-tls", "postgres-protocol", "postgres_backend", "pq_proto", @@ -4450,6 +4337,7 @@ dependencies = [ "rstest", "rustc-hash", "rustls 0.22.4", + "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4479,7 +4367,6 @@ dependencies = [ "utils", "uuid", "walkdir", - "webpki-roots 0.25.2", "workspace_hack", "x509-parser", ] @@ -4786,20 +4673,21 @@ dependencies = [ "http 0.2.9", "http-body 0.4.5", "hyper 0.14.26", - "hyper-tls", + "hyper-rustls 0.24.0", "ipnet", "js-sys", "log", "mime", - "native-tls", "once_cell", "percent-encoding", "pin-project-lite", + "rustls 0.21.11", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-native-tls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", @@ -4807,6 +4695,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.3.0", "web-sys", + "webpki-roots 0.25.2", "winreg 0.50.0", ] @@ -5232,20 +5121,22 @@ dependencies = [ "hex", "histogram", "itertools", - "native-tls", + "once_cell", "pageserver", "pageserver_api", - "postgres-native-tls", "postgres_ffi", "rand 0.8.5", "remote_storage", "reqwest 0.12.4", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", "serde", "serde_json", "serde_with", "thiserror", "tokio", "tokio-postgres", + "tokio-postgres-rustls", "tokio-rustls 0.25.0", "tokio-stream", "tokio-util", @@ -6189,8 +6080,6 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", "js-sys", - "libc", - "num_threads", "serde", "time-core", "time-macros", 
@@ -6300,16 +6189,6 @@ dependencies = [ "syn 2.0.52", ] -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", -] - [[package]] name = "tokio-postgres" version = "0.7.7" @@ -6716,15 +6595,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "tz-rs" -version = "0.6.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" -dependencies = [ - "const_fn", -] - [[package]] name = "uname" version = "0.1.1" @@ -7629,9 +7499,9 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" dependencies = [ "zeroize_derive", ] diff --git a/Cargo.toml b/Cargo.toml index 0887c039f8bf..58715db32b2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,10 +46,10 @@ anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } atomic-take = "1.1.0" -azure_core = "0.19" -azure_identity = "0.19" -azure_storage = "0.19" -azure_storage_blobs = "0.19" +azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -114,7 +114,6 @@ md5 = "0.7.0" measured = { version = "0.0.21", features=["lasso"] } measured-process = { version = "0.0.21" } memoffset = "0.8" -native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" num_cpus = "1.15" @@ -191,7 +190,7 @@ url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.25" +rustls-native-certs = "0.7" x509-parser = "0.15" ## TODO replace this with tracing @@ -200,7 +199,6 @@ log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } @@ -241,8 +239,7 @@ tonic-build = "0.9" [patch.crates-io] -# This is only needed for proxy's tests. -# TODO: we should probably fork `tokio-postgres-rustls` instead. +# Needed to get `tokio-postgres-rustls` to depend on our fork. 
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } # bug fixes for UUID diff --git a/deny.toml b/deny.toml index 22e39a2ca379..469609c496a6 100644 --- a/deny.toml +++ b/deny.toml @@ -99,6 +99,13 @@ name = "async-executor" [[bans.deny]] name = "smol" +[[bans.deny]] +# We want to use rustls instead of the platform's native tls implementation. +name = "native-tls" + +[[bans.deny]] +name = "openssl" + # This section is considered when running `cargo deny check sources`. # More documentation about the 'sources' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 7da0763bc18a..0b892e327768 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -82,6 +82,7 @@ thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -94,10 +95,8 @@ url.workspace = true urlencoding.workspace = true utils.workspace = true uuid.workspace = true -webpki-roots.workspace = true +rustls-native-certs.workspace = true x509-parser.workspace = true -native-tls.workspace = true -postgres-native-tls.workspace = true postgres-protocol.workspace = true redis.workspace = true diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 4433b3c1c2dd..feb09d563896 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -10,11 +10,14 @@ use crate::{ }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; +use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use std::{io, net::SocketAddr, time::Duration}; +use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; +use std::{io, net::SocketAddr, sync::Arc, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -30,7 +33,7 @@ pub enum ConnectionError { CouldNotConnect(#[from] io::Error), #[error("{COULD_NOT_CONNECT}: {0}")] - TlsError(#[from] native_tls::Error), + TlsError(#[from] InvalidDnsNameError), #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), @@ -257,7 +260,7 @@ pub struct PostgresConnection { /// Socket connected to a compute node. pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< tokio::net::TcpStream, - postgres_native_tls::TlsStream, + tokio_postgres_rustls::RustlsStream, >, /// PostgreSQL connection parameters. 
pub params: std::collections::HashMap, @@ -282,12 +285,23 @@ impl ConnCfg { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; drop(pause); - let tls_connector = native_tls::TlsConnector::builder() - .danger_accept_invalid_certs(allow_self_signed_compute) - .build() - .unwrap(); - let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; + let client_config = if allow_self_signed_compute { + // Allow all certificates for creating the connection + let verifier = Arc::new(AcceptEverythingVerifier) as Arc; + rustls::ClientConfig::builder() + .dangerous() + .with_custom_certificate_verifier(verifier) + } else { + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + rustls::ClientConfig::builder().with_root_certificates(root_store) + }; + let client_config = client_config.with_no_client_auth(); + + let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let tls = >::make_tls_connect( + &mut mk_tls, + host, + )?; // connect_raw() will not use TLS if sslmode is "disable" let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); @@ -340,6 +354,58 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } +fn load_certs() -> Result, io::Error> { + let der_certs = rustls_native_certs::load_native_certs()?; + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs); + Ok(Arc::new(store)) +} +static TLS_ROOTS: OnceCell> = OnceCell::new(); + +#[derive(Debug)] +struct AcceptEverythingVerifier; +impl ServerCertVerifier for AcceptEverythingVerifier { + fn supported_verify_schemes(&self) -> Vec { + use rustls::SignatureScheme::*; + // The schemes for which `SignatureScheme::supported_in_tls13` returns true. 
+ vec![ + ECDSA_NISTP521_SHA512, + ECDSA_NISTP384_SHA384, + ECDSA_NISTP256_SHA256, + RSA_PSS_SHA512, + RSA_PSS_SHA384, + RSA_PSS_SHA256, + ED25519, + ] + } + fn verify_server_cert( + &self, + _end_entity: &rustls::pki_types::CertificateDer<'_>, + _intermediates: &[rustls::pki_types::CertificateDer<'_>], + _server_name: &rustls::pki_types::ServerName<'_>, + _ocsp_response: &[u8], + _now: rustls::pki_types::UnixTime, + ) -> Result { + Ok(rustls::client::danger::ServerCertVerified::assertion()) + } + fn verify_tls12_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } + fn verify_tls13_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index dd5d453a2bfa..e56bd43fb80b 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -22,8 +22,7 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true -native-tls.workspace = true -postgres-native-tls.workspace = true +tokio-postgres-rustls.workspace = true postgres_ffi.workspace = true tokio-stream.workspace = true tokio-postgres.workspace = true @@ -31,6 +30,9 @@ tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true camino.workspace = true +rustls.workspace = true +rustls-native-certs.workspace = true +once_cell.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/s3_scrubber/src/scan_safekeeper_metadata.rs index 73dd49ceb5ff..24051b03de08 100644 --- a/s3_scrubber/src/scan_safekeeper_metadata.rs +++ b/s3_scrubber/src/scan_safekeeper_metadata.rs @@ -1,7 +1,8 @@ -use std::{collections::HashSet, str::FromStr}; +use std::{collections::HashSet, str::FromStr, sync::Arc}; use aws_sdk_s3::Client; use futures::stream::{StreamExt, TryStreamExt}; +use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use serde::Serialize; @@ -70,9 +71,12 @@ pub async fn scan_safekeeper_metadata( "checking bucket {}, region {}, dump_db_table {}", bucket_config.bucket, bucket_config.region, dump_db_table ); - // Use the native TLS implementation (Neon requires TLS) - let tls_connector = - postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap()); + // Use rustls (Neon requires TLS) + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + let client_config = rustls::ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; // The connection object performs the actual communication with the database, // so spawn it off to run on its own. 
@@ -234,3 +238,11 @@ async fn check_timeline( is_deleted: false, }) } + +fn load_certs() -> Result, std::io::Error> { + let der_certs = rustls_native_certs::load_native_certs()?; + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs); + Ok(Arc::new(store)) +} +static TLS_ROOTS: OnceCell> = OnceCell::new(); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f364a6c2e01a..df16c717893b 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -59,7 +59,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } -reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "default-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "rustls-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -68,7 +68,7 @@ sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } -time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } +time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } From c8cebecabf75149211866cd8e8f07ec061ccc2a5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 29 May 2024 11:17:05 +0100 Subject: [PATCH 07/31] proxy: reintroduce dynamic limiter for compute lock (#7737) ## Problem Computes that are healthy can manage many connection attempts at a time. Unhealthy computes cannot. We initially handled this with a fixed concurrency limit, but it seems this inhibits pgbench. 
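AIMD (additive increase, multiplicative decrease) is the classic answer to this: grow the concurrency limit slowly while connection attempts succeed and the limit is actually being used, and cut it sharply as soon as a compute starts failing or timing out. The real implementation lives in the new `proxy/src/rate_limiter/limit_algorithm/aimd.rs` added by this patch; the sketch below is illustrative only (the `Aimd` struct and `on_sample` method are invented for this example), but it uses the same knob names (`min`, `max`, `inc`, `dec`, `utilisation`) that the new JSON lock options accept, as exercised by the config test further down.

```rust
// Illustrative AIMD controller for a concurrency limit (std only, not the patch's code).
struct Aimd {
    limit: f64,       // current concurrency limit
    min: f64,         // never shrink below this
    max: f64,         // never grow above this
    inc: f64,         // additive increase per successful sample
    dec: f64,         // multiplicative decrease factor on failure
    utilisation: f64, // only grow when this fraction of the limit is in use
}

impl Aimd {
    fn on_sample(&mut self, in_flight: usize, success: bool) {
        if success {
            // Growing an underused limit tells us nothing, so only
            // increase when we are close to saturating the current one.
            if in_flight as f64 >= self.limit * self.utilisation {
                self.limit = (self.limit + self.inc).min(self.max);
            }
        } else {
            // Back off quickly when the compute looks unhealthy.
            self.limit = (self.limit * self.dec).max(self.min);
        }
    }

    fn limit(&self) -> usize {
        self.limit as usize
    }
}

fn main() {
    let mut aimd = Aimd { limit: 100.0, min: 5.0, max: 500.0, inc: 10.0, dec: 0.9, utilisation: 0.8 };
    for _ in 0..5 {
        let in_flight = aimd.limit(); // pretend the limit is fully used
        aimd.on_sample(in_flight, true);
    }
    println!("after 5 healthy samples: {}", aimd.limit()); // 150
    for _ in 0..5 {
        let in_flight = aimd.limit();
        aimd.on_sample(in_flight, false);
    }
    println!("after 5 failures: {}", aimd.limit()); // 88
}
```

With aimd options like those exercised in the config test below (min=5, max=500, inc=10, dec=0.9, utilisation=0.8), a healthy compute quickly earns a high limit while an unhealthy one is throttled back within a few failed samples, instead of being pinned to one fixed number.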
## Summary of changes Support AIMD for connect_to_compute lock to allow varying the concurrency limit based on compute health --- Cargo.lock | 1 + proxy/Cargo.toml | 1 + proxy/src/bin/proxy.rs | 17 +- proxy/src/config.rs | 67 ++++- proxy/src/console/provider.rs | 44 +-- proxy/src/console/provider/neon.rs | 2 +- proxy/src/proxy/connect_compute.rs | 4 +- proxy/src/rate_limiter.rs | 4 + proxy/src/rate_limiter/limit_algorithm.rs | 275 ++++++++++++++++++ .../src/rate_limiter/limit_algorithm/aimd.rs | 184 ++++++++++++ proxy/src/serverless/backend.rs | 4 +- 11 files changed, 563 insertions(+), 40 deletions(-) create mode 100644 proxy/src/rate_limiter/limit_algorithm.rs create mode 100644 proxy/src/rate_limiter/limit_algorithm/aimd.rs diff --git a/Cargo.lock b/Cargo.lock index b1a307dd1942..794486e2e146 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4302,6 +4302,7 @@ dependencies = [ "http 1.1.0", "http-body-util", "humantime", + "humantime-serde", "hyper 0.14.26", "hyper 1.2.0", "hyper-util", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0b892e327768..288f7769fef3 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -38,6 +38,7 @@ hmac.workspace = true hostname.workspace = true http.workspace = true humantime.workspace = true +humantime-serde.workspace = true hyper.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 30f2e6f4b7cb..dffebf55800c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -557,14 +557,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let config::ConcurrencyLockOptions { shards, - permits, + limiter, epoch, timeout, } = args.wake_compute_lock.parse()?; - info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); let locks = Box::leak(Box::new(console::locks::ApiLocks::new( "wake_compute_lock", - permits, + limiter, shards, timeout, epoch, @@ -603,14 +603,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let config::ConcurrencyLockOptions { shards, - permits, + limiter, epoch, timeout, } = args.connect_compute_lock.parse()?; - info!(permits, shards, ?epoch, "Using NodeLocks (connect_compute)"); + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); let connect_compute_locks = console::locks::ApiLocks::new( "connect_compute_lock", - permits, + limiter, shards, timeout, epoch, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5a0c251ce2d4..f4707a33aa79 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,7 +1,7 @@ use crate::{ auth::{self, backend::AuthRateLimiter}, console::locks::ApiLocks, - rate_limiter::RateBucketInfo, + rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, scram::threadpool::ThreadPool, serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, Host, @@ -580,14 +580,18 @@ impl RetryConfig { } /// Helper for cmdline cache options parsing. 
+#[derive(serde::Deserialize)] pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn - pub permits: usize, + #[serde(flatten)] + pub limiter: RateLimiterConfig, /// Garbage collection epoch + #[serde(deserialize_with = "humantime_serde::deserialize")] pub epoch: Duration, /// Lock timeout + #[serde(deserialize_with = "humantime_serde::deserialize")] pub timeout: Duration, } @@ -596,13 +600,18 @@ impl ConcurrencyLockOptions { pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = - "shards=64,permits=10,epoch=10m,timeout=10ms"; + "shards=64,permits=100,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; /// Parse lock options passed via cmdline. /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`]. fn parse(options: &str) -> anyhow::Result { + let options = options.trim(); + if options.starts_with('{') && options.ends_with('}') { + return Ok(serde_json::from_str(options)?); + } + let mut shards = None; let mut permits = None; let mut epoch = None; @@ -629,9 +638,13 @@ impl ConcurrencyLockOptions { shards = Some(2); } + let permits = permits.context("missing `permits`")?; let out = Self { shards: shards.context("missing `shards`")?, - permits: permits.context("missing `permits`")?, + limiter: RateLimiterConfig { + algorithm: RateLimitAlgorithm::Fixed, + initial_limit: permits, + }, epoch: epoch.context("missing `epoch`")?, timeout: timeout.context("missing `timeout`")?, }; @@ -657,6 +670,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { + use crate::rate_limiter::Aimd; + use super::*; #[test] @@ -684,36 +699,68 @@ mod tests { fn test_parse_lock_options() -> anyhow::Result<()> { let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?; assert_eq!(epoch, Duration::from_secs(10 * 60)); assert_eq!(timeout, Duration::from_secs(1)); assert_eq!(shards, 32); - assert_eq!(permits, 4); + assert_eq!(limiter.initial_limit, 4); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?; assert_eq!(epoch, Duration::from_secs(60)); assert_eq!(timeout, Duration::from_millis(100)); assert_eq!(shards, 16); - assert_eq!(permits, 8); + assert_eq!(limiter.initial_limit, 8); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "permits=0".parse()?; assert_eq!(epoch, Duration::ZERO); assert_eq!(timeout, Duration::ZERO); assert_eq!(shards, 2); - assert_eq!(permits, 0); + assert_eq!(limiter.initial_limit, 0); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); + + Ok(()) + } + + #[test] + fn test_parse_json_lock_options() -> anyhow::Result<()> { + let ConcurrencyLockOptions { + epoch, + limiter, + shards, + timeout, + } = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"# + .parse()?; + assert_eq!(epoch, Duration::from_secs(10 * 60)); + assert_eq!(timeout, Duration::from_secs(1)); + assert_eq!(shards, 32); + assert_eq!(limiter.initial_limit, 44); + assert_eq!( + limiter.algorithm, 
+ RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 5, + max: 500, + dec: 0.9, + inc: 10, + utilisation: 0.8 + } + }, + ); Ok(()) } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 3b996cdbd11f..4d074f98a599 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -15,11 +15,11 @@ use crate::{ error::ReportableError, intern::ProjectIdInt, metrics::ApiLockMetrics, + rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, scram, EndpointCacheKey, }; use dashmap::DashMap; use std::{hash::Hash, sync::Arc, time::Duration}; -use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::time::Instant; use tracing::info; @@ -443,8 +443,8 @@ impl ApiCaches { /// Various caches for [`console`](super). pub struct ApiLocks { name: &'static str, - node_locks: DashMap>, - permits: usize, + node_locks: DashMap>, + config: RateLimiterConfig, timeout: Duration, epoch: std::time::Duration, metrics: &'static ApiLockMetrics, @@ -452,8 +452,6 @@ pub struct ApiLocks { #[derive(Debug, thiserror::Error)] pub enum ApiLockError { - #[error("lock was closed")] - AcquireError(#[from] tokio::sync::AcquireError), #[error("permit could not be acquired")] TimeoutError(#[from] tokio::time::error::Elapsed), } @@ -461,7 +459,6 @@ pub enum ApiLockError { impl ReportableError for ApiLockError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service, ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, } } @@ -470,7 +467,7 @@ impl ReportableError for ApiLockError { impl ApiLocks { pub fn new( name: &'static str, - permits: usize, + config: RateLimiterConfig, shards: usize, timeout: Duration, epoch: std::time::Duration, @@ -479,7 +476,7 @@ impl ApiLocks { Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), - permits, + config, timeout, epoch, metrics, @@ -487,8 +484,10 @@ impl ApiLocks { } pub async fn get_permit(&self, key: &K) -> Result { - if self.permits == 0 { - return Ok(WakeComputePermit { permit: None }); + if self.config.initial_limit == 0 { + return Ok(WakeComputePermit { + permit: Token::disabled(), + }); } let now = Instant::now(); let semaphore = { @@ -500,24 +499,22 @@ impl ApiLocks { .entry(key.clone()) .or_insert_with(|| { self.metrics.semaphores_registered.inc(); - Arc::new(Semaphore::new(self.permits)) + DynamicLimiter::new(self.config) }) .clone() } }; - let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; + let permit = semaphore.acquire_deadline(now + self.timeout).await; self.metrics .semaphore_acquire_seconds .observe(now.elapsed().as_secs_f64()); - Ok(WakeComputePermit { - permit: Some(permit??), - }) + Ok(WakeComputePermit { permit: permit? 
}) } pub async fn garbage_collect_worker(&self) { - if self.permits == 0 { + if self.config.initial_limit == 0 { return; } let mut interval = @@ -547,12 +544,21 @@ impl ApiLocks { } pub struct WakeComputePermit { - // None if the lock is disabled - permit: Option, + permit: Token, } impl WakeComputePermit { pub fn should_check_cache(&self) -> bool { - self.permit.is_some() + !self.permit.is_disabled() + } + pub fn release(self, outcome: Outcome) { + self.permit.release(outcome) + } + pub fn release_result(self, res: Result) -> Result { + match res { + Ok(_) => self.release(Outcome::Success), + Err(_) => self.release(Outcome::Overload), + } + res } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 7728d2cafa8c..5d691e5f1538 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -301,7 +301,7 @@ impl super::Api for Api { } } - let mut node = self.do_wake_compute(ctx, user_info).await?; + let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; ctx.set_project(node.aux.clone()); let cold_start_info = node.aux.cold_start_info; info!("woken up a compute node"); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index c8528d029622..409d45b39a34 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -84,8 +84,8 @@ impl ConnectMechanism for TcpMechanism<'_> { timeout: time::Duration, ) -> Result { let host = node_info.config.get_host()?; - let _permit = self.locks.get_permit(&host).await?; - node_info.connect(ctx, timeout).await + let permit = self.locks.get_permit(&host).await?; + permit.release_result(node_info.connect(ctx, timeout).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index c54226754796..be9072dd8c2c 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,2 +1,6 @@ +mod limit_algorithm; mod limiter; +pub use limit_algorithm::{ + aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, +}; pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs new file mode 100644 index 000000000000..072fdb80b002 --- /dev/null +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -0,0 +1,275 @@ +//! Algorithms for controlling concurrency limits. +use parking_lot::Mutex; +use std::{pin::pin, sync::Arc, time::Duration}; +use tokio::{ + sync::Notify, + time::{error::Elapsed, timeout_at, Instant}, +}; + +use self::aimd::Aimd; + +pub mod aimd; + +/// Whether a job succeeded or failed as a result of congestion/overload. +/// +/// Errors not considered to be caused by overload should be ignored. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Outcome { + /// The job succeeded, or failed in a way unrelated to overload. + Success, + /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal + /// was observed. + Overload, +} + +/// An algorithm for controlling a concurrency limit. +pub trait LimitAlgorithm: Send + Sync + 'static { + /// Update the concurrency limit in response to a new job completion. + fn update(&self, old_limit: usize, sample: Sample) -> usize; +} + +/// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay). 
+#[derive(Debug, Clone, PartialEq, Eq, Copy)] +pub struct Sample { + pub(crate) latency: Duration, + /// Jobs in flight when the sample was taken. + pub(crate) in_flight: usize, + pub(crate) outcome: Outcome, +} + +#[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum RateLimitAlgorithm { + #[default] + Fixed, + Aimd { + #[serde(flatten)] + conf: Aimd, + }, +} + +pub struct Fixed; + +impl LimitAlgorithm for Fixed { + fn update(&self, old_limit: usize, _sample: Sample) -> usize { + old_limit + } +} + +#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] +pub struct RateLimiterConfig { + #[serde(flatten)] + pub algorithm: RateLimitAlgorithm, + pub initial_limit: usize, +} + +impl RateLimiterConfig { + pub fn create_rate_limit_algorithm(self) -> Box { + match self.algorithm { + RateLimitAlgorithm::Fixed => Box::new(Fixed), + RateLimitAlgorithm::Aimd { conf } => Box::new(conf), + } + } +} + +pub struct LimiterInner { + alg: Box, + available: usize, + limit: usize, + in_flight: usize, +} + +impl LimiterInner { + fn update(&mut self, latency: Duration, outcome: Option) { + if let Some(outcome) = outcome { + let sample = Sample { + latency, + in_flight: self.in_flight, + outcome, + }; + self.limit = self.alg.update(self.limit, sample); + } + } + + fn take(&mut self, ready: &Notify) -> Option<()> { + if self.available > 1 { + self.available -= 1; + self.in_flight += 1; + + // tell the next in the queue that there is a permit ready + if self.available > 1 { + ready.notify_one(); + } + Some(()) + } else { + None + } + } +} + +/// Limits the number of concurrent jobs. +/// +/// Concurrency is limited through the use of [`Token`]s. Acquire a token to run a job, and release the +/// token once the job is finished. +/// +/// The limit will be automatically adjusted based on observed latency (delay) and/or failures +/// caused by overload (loss). +pub struct DynamicLimiter { + config: RateLimiterConfig, + inner: Mutex, + // to notify when a token is available + ready: Notify, +} + +/// A concurrency token, required to run a job. +/// +/// Release the token back to the [`DynamicLimiter`] after the job is complete. +pub struct Token { + start: Instant, + limiter: Option>, +} + +/// A snapshot of the state of the [`DynamicLimiter`]. +/// +/// Not guaranteed to be consistent under high concurrency. +#[derive(Debug, Clone, Copy)] +pub struct LimiterState { + limit: usize, + in_flight: usize, +} + +impl DynamicLimiter { + /// Create a limiter with a given limit control algorithm. + pub fn new(config: RateLimiterConfig) -> Arc { + let ready = Notify::new(); + ready.notify_one(); + + Arc::new(Self { + inner: Mutex::new(LimiterInner { + alg: config.create_rate_limit_algorithm(), + available: config.initial_limit, + limit: config.initial_limit, + in_flight: 0, + }), + ready, + config, + }) + } + + /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. + /// + /// Returns `None` if there are none available after `duration`. + pub async fn acquire_timeout(self: &Arc, duration: Duration) -> Result { + self.acquire_deadline(Instant::now() + duration).await + } + + /// Try to acquire a concurrency [Token], waiting until `deadline` if there are none available. + /// + /// Returns `None` if there are none available after `deadline`. + pub async fn acquire_deadline(self: &Arc, deadline: Instant) -> Result { + if self.config.initial_limit == 0 { + // If the rate limiter is disabled, we can always acquire a token. 
+ Ok(Token::disabled()) + } else { + let mut notified = pin!(self.ready.notified()); + let mut ready = notified.as_mut().enable(); + loop { + let mut limit = None; + if ready { + let mut inner = self.inner.lock(); + if inner.take(&self.ready).is_some() { + break Ok(Token::new(self.clone())); + } + limit = Some(inner.limit); + } + match timeout_at(deadline, notified.as_mut()).await { + Ok(()) => ready = true, + Err(e) => { + let limit = limit.unwrap_or_else(|| self.inner.lock().limit); + tracing::info!(limit, "could not acquire token in time"); + break Err(e); + } + } + } + } + } + + /// Return the concurrency [Token], along with the outcome of the job. + /// + /// The [Outcome] of the job, and the time taken to perform it, may be used + /// to update the concurrency limit. + /// + /// Set the outcome to `None` to ignore the job. + fn release_inner(&self, start: Instant, outcome: Option) { + tracing::info!("outcome is {:?}", outcome); + if self.config.initial_limit == 0 { + return; + } + + let mut inner = self.inner.lock(); + + inner.update(start.elapsed(), outcome); + if inner.in_flight < inner.limit { + inner.available = inner.limit - inner.in_flight; + // At least 1 permit is now available + self.ready.notify_one(); + } + + inner.in_flight -= 1; + } + + /// The current state of the limiter. + pub fn state(&self) -> LimiterState { + let inner = self.inner.lock(); + LimiterState { + limit: inner.limit, + in_flight: inner.in_flight, + } + } +} + +impl Token { + fn new(limiter: Arc) -> Self { + Self { + start: Instant::now(), + limiter: Some(limiter), + } + } + pub fn disabled() -> Self { + Self { + start: Instant::now(), + limiter: None, + } + } + + pub fn is_disabled(&self) -> bool { + self.limiter.is_none() + } + + pub fn release(mut self, outcome: Outcome) { + self.release_mut(Some(outcome)) + } + + pub fn release_mut(&mut self, outcome: Option) { + if let Some(limiter) = self.limiter.take() { + limiter.release_inner(self.start, outcome); + } + } +} + +impl Drop for Token { + fn drop(&mut self) { + self.release_mut(None) + } +} + +impl LimiterState { + /// The current concurrency limit. + pub fn limit(&self) -> usize { + self.limit + } + /// The number of jobs in flight. + pub fn in_flight(&self) -> usize { + self.in_flight + } +} diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs new file mode 100644 index 000000000000..370d4be80256 --- /dev/null +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -0,0 +1,184 @@ +use std::usize; + +use super::{LimitAlgorithm, Outcome, Sample}; + +/// Loss-based congestion avoidance. +/// +/// Additive-increase, multiplicative decrease. +/// +/// Adds available currency when: +/// 1. no load-based errors are observed, and +/// 2. the utilisation of the current limit is high. +/// +/// Reduces available concurrency by a factor when load-based errors are detected. +#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] +pub struct Aimd { + /// Minimum limit for AIMD algorithm. + pub min: usize, + /// Maximum limit for AIMD algorithm. + pub max: usize, + /// Decrease AIMD decrease by value in case of error. + pub dec: f32, + /// Increase AIMD increase by value in case of success. + pub inc: usize, + /// A threshold below which the limit won't be increased. 
+ pub utilisation: f32, +} + +impl LimitAlgorithm for Aimd { + fn update(&self, old_limit: usize, sample: Sample) -> usize { + use Outcome::*; + match sample.outcome { + Success => { + let utilisation = sample.in_flight as f32 / old_limit as f32; + + if utilisation > self.utilisation { + let limit = old_limit + self.inc; + let increased_limit = limit.clamp(self.min, self.max); + if increased_limit > old_limit { + tracing::info!(increased_limit, "limit increased"); + } + + increased_limit + } else { + old_limit + } + } + Overload => { + let limit = old_limit as f32 * self.dec; + + // Floor instead of round, so the limit reduces even with small numbers. + // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 + let limit = limit.floor() as usize; + + limit.clamp(self.min, self.max) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use crate::rate_limiter::limit_algorithm::{ + DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, + }; + + use super::*; + + #[tokio::test(start_paused = true)] + async fn should_decrease_limit_on_overload() { + let config = RateLimiterConfig { + initial_limit: 10, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + + assert_eq!(limiter.state().limit(), 5, "overload: decrease"); + } + + #[tokio::test(start_paused = true)] + async fn should_increase_limit_on_success_when_using_gt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 1, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 5, "success: increase"); + } + + #[tokio::test(start_paused = true)] + async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + + token.release(Outcome::Success); + assert_eq!( + limiter.state().limit(), + 4, + "success: ignore when < half limit" + ); + } + + #[tokio::test(start_paused = true)] + async fn should_not_change_limit_when_no_outcome() { + let config = RateLimiterConfig { + initial_limit: 10, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + drop(token); + assert_eq!(limiter.state().limit(), 10, "ignore"); + } +} diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 52fc7b556a15..a40c66a80d3f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -232,9 +232,9 @@ impl ConnectMechanism 
for TokioMechanism { .connect_timeout(timeout); let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); - let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + let res = config.connect(tokio_postgres::NoTls).await; drop(pause); - drop(permit); + let (client, connection) = permit.release_result(res)?; tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); Ok(poll_client( From 7ac11d39421330b64b8dfa72b83439d51c05da0b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 29 May 2024 22:18:09 +0300 Subject: [PATCH 08/31] Do not produce error if gin page is not restored in redo (#7876) ## Problem See https://github.com/neondatabase/cloud/issues/10845 ## Summary of changes Do not report error if GIN page is not restored ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- test_runner/regress/test_gin_redo.py | 22 ++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/revisions.json | 6 +++--- 5 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_gin_redo.py diff --git a/test_runner/regress/test_gin_redo.py b/test_runner/regress/test_gin_redo.py new file mode 100644 index 000000000000..9205882239b9 --- /dev/null +++ b/test_runner/regress/test_gin_redo.py @@ -0,0 +1,22 @@ +import time + +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +# +# Test that redo of XLOG_GIN_VACUUM_PAGE doesn't produce error +# +def test_gin_redo(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + time.sleep(1) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + con = primary.connect() + cur = con.cursor() + cur.execute("create table gin_test_tbl(id integer, i int4[])") + cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)") + cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g") + cur.execute("delete from gin_test_tbl where id % 2 = 0") + cur.execute("vacuum gin_test_tbl") + wait_replica_caughtup(primary, secondary) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 0d30e28f74f4..17e0f5ff4e19 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f +Subproject commit 17e0f5ff4e1905691aa40e1e08f9b79b14c99652 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 74fb144890c4..c2c3d40534db 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d +Subproject commit c2c3d40534db97d83dd7e185d1971e707fa2f445 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 3c2b9d576c58..b228f20372eb 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2 +Subproject commit b228f20372ebcabfd7946647cb7adbd38bacb14a diff --git a/vendor/revisions.json 
b/vendor/revisions.json index 2f16f334c561..5bf4e289ef92 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"], - "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"], - "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"] + "v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"], + "v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"], + "v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"] } From b0a954bde237f424381a76af5ffd046a9b0e85a7 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Thu, 30 May 2024 08:25:10 +0200 Subject: [PATCH 09/31] CI: switch ubuntu-latest with ubuntu-22.04 (#7256) (#7901) ## Problem We use ubuntu-latest as a default OS for running jobs. It can cause problems due to instability, so we should use the LTS version of Ubuntu. ## Summary of changes The image ubuntu-latest was changed with ubuntu-22.04 in workflows. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .github/workflows/actionlint.yml | 14 +++++++++++++- .github/workflows/approved-for-ci-run.yml | 6 +++--- .github/workflows/benchmarking.yml | 2 +- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 12 ++++++------ .github/workflows/check-build-tools-image.yml | 2 +- .github/workflows/check-permissions.yml | 2 +- .github/workflows/cleanup-caches-by-a-branch.yml | 2 +- .github/workflows/pg_clients.yml | 2 +- .github/workflows/pin-build-tools-image.yml | 2 +- .github/workflows/release-notify.yml | 2 +- .github/workflows/release.yml | 4 ++-- .github/workflows/trigger-e2e-tests.yml | 6 +++--- 13 files changed, 35 insertions(+), 23 deletions(-) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index f2736614bf80..078c7f88c460 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -24,7 +24,7 @@ jobs: actionlint: needs: [ check-permissions ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - uses: reviewdog/action-actionlint@v1 @@ -36,3 +36,15 @@ jobs: fail_on_error: true filter_mode: nofilter level: error + - run: | + PAT='^\s*runs-on:.*-latest' + if grep -ERq $PAT .github/workflows + then + grep -ERl $PAT .github/workflows |\ + while read -r f + do + l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1) + echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead." 
+ done + exit 1 + fi diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index ab616d17e2c1..b14b66a439fb 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -44,7 +44,7 @@ jobs: contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" @@ -60,7 +60,7 @@ jobs: github.event.action == 'labeled' && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" @@ -109,7 +109,7 @@ jobs: github.event.action == 'closed' && github.event.pull_request.head.repo.full_name != github.repository - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index d5a375d70429..57d24063bfb3 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -137,7 +137,7 @@ jobs: # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }} diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index bdf00bcaae1b..9aacb09d10c2 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -88,7 +88,7 @@ jobs: merge-images: needs: [ build-image ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: IMAGE_TAG: ${{ inputs.image-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f8c011a0a579..b9caf7606020 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -35,7 +35,7 @@ jobs: cancel-previous-e2e-tests: needs: [ check-permissions ] if: github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cancel previous e2e-tests runs for this PR @@ -549,7 +549,7 @@ jobs: report-benchmarks-failures: needs: [ benchmarks, create-test-report ] if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: slackapi/slack-github-action@v1 @@ -774,7 +774,7 @@ jobs: neon-image: needs: [ neon-image-arch, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: docker/login-action@v3 @@ -884,7 +884,7 @@ jobs: compute-node-image: needs: [ compute-node-image-arch, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: @@ -1032,7 +1032,7 @@ jobs: promote-images: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: VERSIONS: v14 v15 v16 @@ -1077,7 +1077,7 @@ jobs: trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Set PR's 
status to pending and request a remote CI test run: | diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index a1e22cf93f6e..97116940a005 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -19,7 +19,7 @@ permissions: {} jobs: check-image: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 outputs: tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} found: ${{ steps.check-image.outputs.found }} diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml index c3357c6cf8e3..9c427947972f 100644 --- a/.github/workflows/check-permissions.yml +++ b/.github/workflows/check-permissions.yml @@ -16,7 +16,7 @@ permissions: {} jobs: check-permissions: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Disallow CI runs on PRs from forks if: | diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml index d8c225dedb10..0c074e36dc14 100644 --- a/.github/workflows/cleanup-caches-by-a-branch.yml +++ b/.github/workflows/cleanup-caches-by-a-branch.yml @@ -9,7 +9,7 @@ on: jobs: cleanup: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cleanup run: | diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 50e3227a7450..fef3aec754b2 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml @@ -20,7 +20,7 @@ concurrency: jobs: test-postgres-client-libs: # TODO: switch to gen2 runner, requires docker - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 env: DEFAULT_PG_VERSION: 14 diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index d495a158e88d..024594532fbe 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -26,7 +26,7 @@ permissions: {} jobs: tag-image: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: FROM_TAG: ${{ inputs.from-tag }} diff --git a/.github/workflows/release-notify.yml b/.github/workflows/release-notify.yml index ba396dba74b8..8bd10e993c64 100644 --- a/.github/workflows/release-notify.yml +++ b/.github/workflows/release-notify.yml @@ -19,7 +19,7 @@ on: jobs: notify: - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 steps: - uses: neondatabase/dev-actions/release-pr-notify@main diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fe24f6330ee3..90a3aaaf2ddf 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -26,7 +26,7 @@ defaults: jobs: create-storage-release-branch: if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: contents: write # for `git push` @@ -65,7 +65,7 @@ jobs: create-proxy-release-branch: if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: contents: write # for `git push` diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 7111ee37faa5..77928a343e5e 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -19,7 +19,7 @@ env: jobs: cancel-previous-e2e-tests: if: github.event_name == 'pull_request' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Cancel previous 
e2e-tests runs for this PR @@ -31,7 +31,7 @@ jobs: --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" tag: - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 outputs: build-tag: ${{ steps.build-tag.outputs.tag }} @@ -62,7 +62,7 @@ jobs: trigger-e2e-tests: needs: [ tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: TAG: ${{ needs.tag.outputs.build-tag }} steps: From 238fa47bee911d23730dc1e8e91defb0bf57dda9 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 30 May 2024 11:09:27 +0100 Subject: [PATCH 10/31] proxy fix wake compute rate limit (#7902) ## Problem We were rate limiting wake_compute in the wrong place ## Summary of changes Move wake_compute rate limit to after the permit is acquired. Also makes a slight refactor on normalize, as it caught my eye --- proxy/src/auth/backend.rs | 2 +- proxy/src/console/provider/neon.rs | 19 ++++++++++--------- proxy/src/lib.rs | 21 ++++++++++++--------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3555eba54345..f757a15fbb9b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -35,7 +35,7 @@ use crate::{ }, stream, url, }; -use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName}; +use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 5d691e5f1538..d72229b02934 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -13,7 +13,7 @@ use crate::{ http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, - scram, EndpointCacheKey, Normalize, + scram, EndpointCacheKey, }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; @@ -281,14 +281,6 @@ impl super::Api for Api { return Ok(cached); } - // check rate limit - if !self - .wake_compute_endpoint_rate_limiter - .check(user_info.endpoint.normalize().into(), 1) - { - return Err(WakeComputeError::TooManyConnections); - } - let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled @@ -301,6 +293,15 @@ impl super::Api for Api { } } + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + info!(key = &*key, "found cached compute node info"); + return Err(WakeComputeError::TooManyConnections); + } + let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; ctx.set_project(node.aux.clone()); let cold_start_info = node.aux.cold_start_info; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 35c1616481ee..ea92eaaa5583 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -3,6 +3,7 @@ use std::convert::Infallible; use anyhow::{bail, Context}; +use intern::{EndpointIdInt, EndpointIdTag, InternId}; use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -129,20 +130,22 @@ macro_rules! 
smol_str_wrapper { const POOLER_SUFFIX: &str = "-pooler"; -pub trait Normalize { - fn normalize(&self) -> Self; -} - -impl + From> Normalize for S { +impl EndpointId { fn normalize(&self) -> Self { - if self.as_ref().ends_with(POOLER_SUFFIX) { - let mut s = self.as_ref().to_string(); - s.truncate(s.len() - POOLER_SUFFIX.len()); - s.into() + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + stripped.into() } else { self.clone() } } + + fn normalize_intern(&self) -> EndpointIdInt { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + EndpointIdTag::get_interner().get_or_intern(stripped) + } else { + self.into() + } + } } // 90% of role name strings are 20 characters or less. From fddd11dd1a4d86623d1683ecc7f9f679a5132f89 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 30 May 2024 11:10:27 +0100 Subject: [PATCH 11/31] proxy: upload postgres connection options as json in the parquet upload (#7903) ## Problem https://github.com/neondatabase/cloud/issues/9943 ## Summary of changes Captures the postgres options, converts them to json, uploads them in parquet. --- libs/pq_proto/src/lib.rs | 7 ++- proxy/src/auth/backend/link.rs | 1 + proxy/src/auth/credentials.rs | 8 --- proxy/src/context.rs | 21 ++++++- proxy/src/context/parquet.rs | 83 +++++++++++++++++---------- proxy/src/proxy.rs | 2 + proxy/src/serverless/sql_over_http.rs | 13 +++-- 7 files changed, 89 insertions(+), 46 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 522b65f5d10f..f8e578c6f292 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -50,12 +50,17 @@ pub enum FeStartupPacket { }, } -#[derive(Debug)] +#[derive(Debug, Clone, Default)] pub struct StartupMessageParams { params: HashMap, } impl StartupMessageParams { + /// Set parameter's value by its name. + pub fn insert(&mut self, name: &str, value: &str) { + self.params.insert(name.to_owned(), value.to_owned()); + } + /// Get parameter's value by its name. pub fn get(&self, name: &str) -> Option<&str> { self.params.get(name).map(|s| s.as_str()) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 415a4b7d856d..5932e1337c80 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -100,6 +100,7 @@ pub(super) async fn authenticate( .dbname(&db_info.dbname) .user(&db_info.user); + ctx.set_dbname(db_info.dbname.into()); ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); info!("woken up a compute node"); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 783a1a5a2169..d06f5614f1fc 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -11,7 +11,6 @@ use crate::{ }; use itertools::Itertools; use pq_proto::StartupMessageParams; -use smol_str::SmolStr; use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; @@ -96,13 +95,6 @@ impl ComputeUserInfoMaybeEndpoint { let get_param = |key| params.get(key).ok_or(MissingKey(key)); let user: RoleName = get_param("user")?.into(); - // record the values if we have them - ctx.set_application(params.get("application_name").map(SmolStr::from)); - ctx.set_user(user.clone()); - if let Some(dbname) = params.get("database") { - ctx.set_dbname(dbname.into()); - } - // Project name might be passed via PG's command-line options. 
let endpoint_option = params .options_raw() diff --git a/proxy/src/context.rs b/proxy/src/context.rs index dfd3ef108ef2..ff79ba827573 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -2,6 +2,7 @@ use chrono::Utc; use once_cell::sync::OnceCell; +use pq_proto::StartupMessageParams; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; @@ -46,6 +47,7 @@ pub struct RequestMonitoring { pub(crate) auth_method: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, + pg_options: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. @@ -102,6 +104,7 @@ impl RequestMonitoring { success: false, rejected: None, cold_start_info: ColdStartInfo::Unknown, + pg_options: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), @@ -132,6 +135,18 @@ impl RequestMonitoring { self.latency_timer.cold_start_info(info); } + pub fn set_db_options(&mut self, options: StartupMessageParams) { + self.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + self.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + self.set_dbname(dbname.into()); + } + + self.pg_options = Some(options); + } + pub fn set_project(&mut self, x: MetricsAuxInfo) { if self.endpoint_id.is_none() { self.set_endpoint_id(x.endpoint_id.as_str().into()) @@ -155,8 +170,10 @@ impl RequestMonitoring { } } - pub fn set_application(&mut self, app: Option) { - self.application = app.or_else(|| self.application.clone()); + fn set_application(&mut self, app: Option) { + if let Some(app) = app { + self.application = Some(app); + } } pub fn set_dbname(&mut self, dbname: DbName) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index a213a32ca41d..1355b7e1d86b 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -13,7 +13,9 @@ use parquet::{ }, record::RecordWriter, }; +use pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; @@ -87,6 +89,7 @@ pub struct RequestData { database: Option, project: Option, branch: Option, + pg_options: Option, auth_method: Option<&'static str>, error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside @@ -101,6 +104,23 @@ pub struct RequestData { disconnect_timestamp: Option, } +struct Options<'a> { + options: &'a StartupMessageParams, +} + +impl<'a> serde::Serialize for Options<'a> { + fn serialize(&self, s: S) -> Result + where + S: serde::Serializer, + { + let mut state = s.serialize_map(None)?; + for (k, v) in self.options.iter() { + state.serialize_entry(k, v)?; + } + state.end() + } +} + impl From<&RequestMonitoring> for RequestData { fn from(value: &RequestMonitoring) -> Self { Self { @@ -113,6 +133,10 @@ impl From<&RequestMonitoring> for RequestData { database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), + pg_options: value + .pg_options + .as_ref() + .and_then(|options| serde_json::to_string(&Options { options }).ok()), auth_method: value.auth_method.as_ref().map(|x| match x { super::AuthMethod::Web => "web", super::AuthMethod::ScramSha256 => "scram_sha_256", @@ -494,6 +518,7 @@ mod 
tests { database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + pg_options: None, auth_method: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", @@ -570,15 +595,15 @@ mod tests { assert_eq!( file_stats, [ - (1315314, 3, 6000), - (1315307, 3, 6000), - (1315367, 3, 6000), - (1315324, 3, 6000), - (1315454, 3, 6000), - (1315296, 3, 6000), - (1315088, 3, 6000), - (1315324, 3, 6000), - (438713, 1, 2000) + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) ] ); @@ -608,11 +633,11 @@ mod tests { assert_eq!( file_stats, [ - (1222212, 5, 10000), - (1228362, 5, 10000), - (1230156, 5, 10000), - (1229518, 5, 10000), - (1220796, 5, 10000) + (1223214, 5, 10000), + (1229364, 5, 10000), + (1231158, 5, 10000), + (1230520, 5, 10000), + (1221798, 5, 10000) ] ); @@ -644,11 +669,11 @@ mod tests { assert_eq!( file_stats, [ - (1207859, 5, 10000), - (1207590, 5, 10000), - (1207883, 5, 10000), - (1207871, 5, 10000), - (1208126, 5, 10000) + (1208861, 5, 10000), + (1208592, 5, 10000), + (1208885, 5, 10000), + (1208873, 5, 10000), + (1209128, 5, 10000) ] ); @@ -673,15 +698,15 @@ mod tests { assert_eq!( file_stats, [ - (1315314, 3, 6000), - (1315307, 3, 6000), - (1315367, 3, 6000), - (1315324, 3, 6000), - (1315454, 3, 6000), - (1315296, 3, 6000), - (1315088, 3, 6000), - (1315324, 3, 6000), - (438713, 1, 2000) + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) ] ); @@ -718,7 +743,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)] + [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5824b70df91a..95b46ae002fe 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -267,6 +267,8 @@ pub async fn handle_client( }; drop(pause); + ctx.set_db_options(params.clone()); + let hostname = mode.hostname(stream.get_ref()); let common_names = tls.map(|tls| &tls.common_names); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5376bddfd314..9a7cdc857780 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -17,6 +17,7 @@ use hyper1::http::HeaderValue; use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; +use pq_proto::StartupMessageParams; use serde_json::json; use serde_json::Value; use tokio::time; @@ -192,13 +193,13 @@ fn get_conn_info( let mut options = Option::None; + let mut params = StartupMessageParams::default(); + params.insert("user", &username); + params.insert("database", &dbname); for (key, value) in pairs { - match &*key { - "options" => { - options = Some(NeonOptions::parse_options_raw(&value)); - } - "application_name" => ctx.set_application(Some(value.into())), - _ => {} + params.insert(&key, &value); + if key == "options" { + options = Some(NeonOptions::parse_options_raw(&value)); } } From 9a081c230f6b1d2bff7fea1e201631a7e7ee4328 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 30 May 2024 12:02:38 +0100 Subject: [PATCH 12/31] proxy: lazily 
parse startup pg params (#7905) ## Problem proxy params being a `HashMap` when it contains just ``` application_name: psql database: neondb user: neondb_owner ``` is quite wasteful allocation wise. ## Summary of changes Keep the params in the wire protocol form, eg: ``` application_name\0psql\0database\0neondb\0user\0neondb_owner\0 ``` Using a linear search for the map is fast enough at small sizes, which is the normal case. --- Cargo.lock | 1 + libs/pq_proto/Cargo.toml | 1 + libs/pq_proto/src/lib.rs | 76 +++++++++++++++------------ proxy/src/serverless/sql_over_http.rs | 4 +- 4 files changed, 46 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 794486e2e146..44edbabaf6ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4113,6 +4113,7 @@ version = "0.1.0" dependencies = [ "byteorder", "bytes", + "itertools", "pin-project-lite", "postgres-protocol", "rand 0.8.5", diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index 6eeb3bafef86..8afabe670ee0 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] bytes.workspace = true byteorder.workspace = true +itertools.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index f8e578c6f292..cee374201763 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -7,8 +7,9 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use itertools::Itertools; use serde::{Deserialize, Serialize}; -use std::{borrow::Cow, collections::HashMap, fmt, io, str}; +use std::{borrow::Cow, fmt, io, str}; // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; @@ -51,19 +52,36 @@ pub enum FeStartupPacket { } #[derive(Debug, Clone, Default)] -pub struct StartupMessageParams { - params: HashMap, +pub struct StartupMessageParamsBuilder { + params: BytesMut, } -impl StartupMessageParams { +impl StartupMessageParamsBuilder { /// Set parameter's value by its name. + /// name and value must not contain a \0 byte pub fn insert(&mut self, name: &str, value: &str) { - self.params.insert(name.to_owned(), value.to_owned()); + self.params.put(name.as_bytes()); + self.params.put(&b"\0"[..]); + self.params.put(value.as_bytes()); + self.params.put(&b"\0"[..]); } + pub fn freeze(self) -> StartupMessageParams { + StartupMessageParams { + params: self.params.freeze(), + } + } +} + +#[derive(Debug, Clone, Default)] +pub struct StartupMessageParams { + params: Bytes, +} + +impl StartupMessageParams { /// Get parameter's value by its name. pub fn get(&self, name: &str) -> Option<&str> { - self.params.get(name).map(|s| s.as_str()) + self.iter().find_map(|(k, v)| (k == name).then_some(v)) } /// Split command-line options according to PostgreSQL's logic, @@ -117,15 +135,19 @@ impl StartupMessageParams { /// Iterate through key-value pairs in an arbitrary order. pub fn iter(&self) -> impl Iterator { - self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) + let params = + std::str::from_utf8(&self.params).expect("should be validated as utf8 already"); + params.split_terminator('\0').tuples() } // This function is mostly useful in tests. 
#[doc(hidden)] pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { - Self { - params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(), + let mut b = StartupMessageParamsBuilder::default(); + for (k, v) in pairs { + b.insert(k, v) } + b.freeze() } } @@ -350,35 +372,21 @@ impl FeStartupPacket { (major_version, minor_version) => { // StartupMessage - // Parse pairs of null-terminated strings (key, value). - // See `postgres: ProcessStartupPacket, build_startup_packet`. - let mut tokens = str::from_utf8(&msg) - .map_err(|_e| { - ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) - })? - .strip_suffix('\0') // drop packet's own null - .ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: missing null terminator".to_string(), - ) - })? - .split_terminator('\0'); - - let mut params = HashMap::new(); - while let Some(name) = tokens.next() { - let value = tokens.next().ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: key without value".to_string(), - ) - })?; - - params.insert(name.to_owned(), value.to_owned()); - } + let s = str::from_utf8(&msg).map_err(|_e| { + ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) + })?; + let s = s.strip_suffix('\0').ok_or_else(|| { + ProtocolError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })?; FeStartupPacket::StartupMessage { major_version, minor_version, - params: StartupMessageParams { params }, + params: StartupMessageParams { + params: msg.slice_ref(s.as_bytes()), + }, } } }; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 9a7cdc857780..9d6a475aebf3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -17,7 +17,7 @@ use hyper1::http::HeaderValue; use hyper1::Response; use hyper1::StatusCode; use hyper1::{HeaderMap, Request}; -use pq_proto::StartupMessageParams; +use pq_proto::StartupMessageParamsBuilder; use serde_json::json; use serde_json::Value; use tokio::time; @@ -193,7 +193,7 @@ fn get_conn_info( let mut options = Option::None; - let mut params = StartupMessageParams::default(); + let mut params = StartupMessageParamsBuilder::default(); params.insert("user", &username); params.insert("database", &dbname); for (key, value) in pairs { From 167394a0735abb422e0f2b544af4b160edbd2f34 Mon Sep 17 00:00:00 2001 From: YukiSeino <36467282+SeinoYuki@users.noreply.github.com> Date: Thu, 30 May 2024 22:58:20 +0900 Subject: [PATCH 13/31] refacter : VirtualFile::open uses AsRef (#7908) ## Problem #7371 ## Summary of changes * The VirtualFile::open, open_with_options, and create methods use AsRef, similar to the standard library's std::fs APIs. --- pageserver/src/virtual_file.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b68f3a0e8931..04d9386fab92 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -344,21 +344,21 @@ macro_rules! with_file { impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub async fn open( - path: &Utf8Path, + pub async fn open>( + path: P, ctx: &RequestContext, ) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true), ctx).await + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. 
- pub async fn create( - path: &Utf8Path, + pub async fn create>( + path: P, ctx: &RequestContext, ) -> Result { Self::open_with_options( - path, + path.as_ref(), OpenOptions::new().write(true).create(true).truncate(true), ctx, ) @@ -370,12 +370,13 @@ impl VirtualFile { /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt, /// they will be applied also when the file is subsequently re-opened, not only /// on the first time. Make sure that's sane! - pub async fn open_with_options( - path: &Utf8Path, + pub async fn open_with_options>( + path: P, open_options: &OpenOptions, _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { - let path_str = path.to_string(); + let path_ref = path.as_ref(); + let path_str = path_ref.to_string(); let parts = path_str.split('/').collect::>(); let (tenant_id, shard_id, timeline_id) = if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { @@ -401,7 +402,7 @@ impl VirtualFile { // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. let file = observe_duration!(StorageIoOperation::Open, { - open_options.open(path.as_std_path()).await? + open_options.open(path_ref.as_std_path()).await? }); // Strip all options other than read and write. @@ -417,7 +418,7 @@ impl VirtualFile { let vfile = VirtualFile { handle: RwLock::new(handle), pos: 0, - path: path.to_path_buf(), + path: path_ref.to_path_buf(), open_options: reopen_options, tenant_id, shard_id, From 1eca8b8a6b56e82445d9d8354e7acfee97c80603 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 30 May 2024 10:03:17 -0400 Subject: [PATCH 14/31] fix(pageserver): ensure to_i128 works for metadata keys (#7895) field2 of metadata keys can be 0xFFFF because of the mapping. Allow 0xFFFF for `to_i128`. An alternative is to encode 0xFFFF as 0xFFFFFFFF (which is allowed in the original `to_i128`). But checking the places where field2 is referenced, the rest part of the system does not seem to depend on this assertion. Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 37 +++++++++++----------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 2511de00d59c..b00d48498ca5 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,6 +1,5 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; -use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; @@ -53,14 +52,8 @@ impl Key { /// Encode a metadata key to a storage key. pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { assert!(is_metadata_key_slice(key), "key not in metadata key range"); - Key { - field1: key[0], - field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, - field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), - field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), - field5: key[11], - field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), - } + // Metadata key space ends at 0x7F so it's fine to directly convert it to i128. + Self::from_i128(i128::from_be_bytes(*key)) } /// Encode a metadata key to a storage key. 
@@ -68,17 +61,6 @@ impl Key { Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) } - /// Extract a metadata key to a writer. The result should always be 16 bytes. - pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { - writer.put_u8(self.field1); - assert!(self.field2 <= 0xFFFF); - writer.put_u16(self.field2 as u16); - writer.put_u32(self.field3); - writer.put_u32(self.field4); - writer.put_u8(self.field5); - writer.put_u32(self.field6); - } - /// Get the range of metadata keys. pub const fn metadata_key_range() -> Range { Key { @@ -121,7 +103,7 @@ impl Key { /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) @@ -175,7 +157,7 @@ impl Key { } /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::from_metadata_key`] instead. + /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -188,7 +170,7 @@ impl Key { } /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. - /// Use [`Key::extract_metadata_key_to_writer`] instead. + /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -687,10 +669,15 @@ mod tests { let mut metadata_key = vec![AUX_KEY_PREFIX]; metadata_key.extend_from_slice(&[0xFF; 15]); let encoded_key = Key::from_metadata_key(&metadata_key); - let mut output_key = Vec::new(); - encoded_key.extract_metadata_key_to_writer(&mut output_key); + let output_key = encoded_key.to_i128().to_be_bytes(); assert_eq!(metadata_key, output_key); assert!(encoded_key.is_metadata_key()); assert!(is_metadata_key_slice(&metadata_key)); } + + #[test] + fn test_possible_largest_key() { + Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF); + // TODO: put this key into the system and see if anything breaks. + } } From 33395dcf4ef137b39d3ba8022e9e4d07e7de9ed4 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 30 May 2024 10:31:57 -0400 Subject: [PATCH 15/31] perf(pageserver): postpone vectored get fringe keyspace construction (#7904) Perf shows a significant amount of time is spent on `Keyspace::merge`. This pull request postpones merging keyspace until retrieving the layer, which contributes to a 30x improvement in aux keyspace basebackup time. 
``` --- old 10000 files found in 0.580569459s --- new 10000 files found in 0.02995075s ``` Signed-off-by: Alex Chi Z --- pageserver/pagebench/src/cmd/aux_files.rs | 17 ++++++++++++----- pageserver/src/tenant/storage_layer.rs | 17 +++++++++++++---- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index eb5b242a5f01..bce3285606cc 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -5,6 +5,7 @@ use utils::lsn::Lsn; use std::collections::HashMap; use std::sync::Arc; +use std::time::Instant; /// Ingest aux files into the pageserver. #[derive(clap::Parser)] @@ -88,11 +89,17 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { println!("ingested {file_cnt} files"); } - let files = mgmt_api_client - .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) - .await?; - - println!("{} files found", files.len()); + for _ in 0..100 { + let start = Instant::now(); + let files = mgmt_api_client + .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + println!( + "{} files found in {}s", + files.len(), + start.elapsed().as_secs_f64() + ); + } anyhow::Ok(()) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9ccf20c0d400..0b3f841ccf8e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -318,7 +318,7 @@ pub(crate) struct LayerFringe { #[derive(Debug)] struct LayerKeyspace { layer: ReadableLayer, - target_keyspace: KeySpace, + target_keyspace: Vec, } impl LayerFringe { @@ -336,6 +336,7 @@ impl LayerFringe { }; let removed = self.layers.remove_entry(&read_desc.layer_id); + match removed { Some(( _, @@ -343,7 +344,15 @@ impl LayerFringe { layer, target_keyspace, }, - )) => Some((layer, target_keyspace, read_desc.lsn_range)), + )) => { + let mut keyspace = KeySpaceRandomAccum::new(); + for ks in target_keyspace { + for part in ks.ranges { + keyspace.add_range(part); + } + } + Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range)) + } None => unreachable!("fringe internals are always consistent"), } } @@ -358,7 +367,7 @@ impl LayerFringe { let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().target_keyspace.merge(&keyspace); + entry.get_mut().target_keyspace.push(keyspace); } Entry::Vacant(entry) => { self.planned_reads_by_lsn.push(ReadDesc { @@ -367,7 +376,7 @@ impl LayerFringe { }); entry.insert(LayerKeyspace { layer, - target_keyspace: keyspace, + target_keyspace: vec![keyspace], }); } } From f20a9e760fc7371c84c550adcb3fe6c553610c96 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 30 May 2024 10:45:34 -0400 Subject: [PATCH 16/31] chore(pageserver): warn on delete non-existing file (#7847) Consider the following sequence of migration: ``` 1. user starts compute 2. force migrate to v2 3. user continues to write data ``` At the time of (3), the compute node is not aware that the page server does not contain replication states any more, and might continue to ingest neon-file records into the safekeeper. This will leave the pageserver store a partial replication state and cause some errors. For example, the compute could issue a deletion of some aux files in v1, but this file does not exist in v2. Therefore, we should ignore all these errors until everyone is migrated to v2. 
Also note that if we see this warning in prod, it is likely because we did not fully suspend users' compute when flipping the v1/v2 flag. Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index afba34c6d154..4480c7df6e68 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1552,7 +1552,7 @@ impl<'a> DatadirModification<'a> { self.tline.aux_file_size_estimator.on_add(content.len()); new_files.push((path, content)); } - (None, true) => anyhow::bail!("removing non-existing aux file: {}", path), + (None, true) => warn!("removing non-existing aux file: {}", path), } let new_val = aux_file::encode_file_value(&new_files)?; self.put(key, Value::Image(new_val.into())); From c18b1c06460f1a55d947cdad267da4f71278655c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 30 May 2024 17:45:48 +0200 Subject: [PATCH 17/31] Update tokio-epoll-uring for linux-raw-sys (#7918) Updates the `tokio-epoll-uring` dependency. There is [only one change](https://github.com/neondatabase/tokio-epoll-uring/compare/342ddd197a060a8354e8f11f4d12994419fff939...08ccfa94ff5507727bf4d8d006666b5b192e04c6), the adoption of linux-raw-sys for `statx` instead of using libc. Part of #7889. --- Cargo.lock | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44edbabaf6ad..96ba5c8ec38b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2915,6 +2915,12 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +[[package]] +name = "linux-raw-sys" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4" + [[package]] name = "lock_api" version = "0.4.10" @@ -6157,7 +6163,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" dependencies = [ "futures", "nix 0.26.4", @@ -6669,11 +6675,12 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" dependencies = [ "bytes", "io-uring", "libc", + "linux-raw-sys 0.6.4", ] [[package]] From 98dadf854383f7d96cd6b30c87ba49b1b5a48d1e Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 09:18:58 +0100 Subject: [PATCH 18/31] pageserver: quieten some shutdown logs around logical size and flush (#7907) ## Problem Looking at several noisy shutdown logs: - In https://github.com/neondatabase/neon/issues/7861 we're hitting a log error with `InternalServerError(timeline shutting down\n'` on the checkpoint API handler. 
- In the field, we see initial_logical_size_calculation errors on shutdown, via DownloadError - In the field, we see errors logged from layer download code (independent of the error propagated) during shutdown Closes: https://github.com/neondatabase/neon/issues/7861 ## Summary of changes The theme of these changes is to avoid propagating anyhow::Errors for cases that aren't really unexpected error cases that we might want a stacktrace for, and avoid "Other" error variants unless we really do have unexpected error cases to propagate. - On the flush_frozen_layers path, use the `FlushLayerError` type throughout, rather than munging it into an anyhow::Error. Give FlushLayerError an explicit from_anyhow helper that checks for timeline cancellation, and uses it to give a Cancelled error instead of an Other error when the timeline is shutting down. - In logical size calculation, remove BackgroundCalculationError (this type was just a Cancelled variant and an Other variant), and instead use CalculateLogicalSizeError throughout. This can express a PageReconstructError, and has a From impl that translates cancel-like page reconstruct errors to Cancelled. - Replace CalculateLogicalSizeError's Other(anyhow::Error) variant case with a Decode(DeserializeError) variant, as this was the only kind of error we actually used in the Other case. - During layer download, drop out early if the timeline is shutting down, so that we don't do an `error!()` log of the shutdown error in this case. --- pageserver/src/http/routes.rs | 16 ++- pageserver/src/page_service.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 18 +++- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 10 +- pageserver/src/tenant/timeline.rs | 100 ++++++++++-------- .../src/tenant/timeline/detach_ancestor.rs | 4 +- test_runner/regress/test_ondemand_download.py | 2 +- 8 files changed, 102 insertions(+), 56 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8a061f3ae15c..913d45d63c6d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -74,6 +74,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; +use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; use crate::tenant::SpawnMode; @@ -1813,11 +1814,22 @@ async fn timeline_checkpoint_handler( timeline .freeze_and_flush() .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| { + match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(other.into()), + + } + })?; timeline .compact(&cancel, flags, &ctx) .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; if wait_until_uploaded { timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index e9651165b106..35150b210e35 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -66,6 +66,7 @@ use crate::tenant::mgr::GetTenantError; use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; +use 
crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -830,7 +831,10 @@ impl PageServerHandler { // We only want to persist the data, and it doesn't matter if it's in the // shape of deltas or images. info!("flushing layers"); - timeline.freeze_and_flush().await?; + timeline.freeze_and_flush().await.map_err(|e| match e { + FlushLayerError::Cancelled => QueryError::Shutdown, + other => QueryError::Other(other.into()), + })?; info!("done"); Ok(()) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4480c7df6e68..0fc846e5f391 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -78,11 +78,19 @@ pub enum LsnForTimestamp { } #[derive(Debug, thiserror::Error)] -pub enum CalculateLogicalSizeError { +pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] Cancelled, + + /// Something went wrong while reading the metadata we use to calculate logical size + /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`] + /// in the `From` implementation for this variant. #[error(transparent)] - Other(#[from] anyhow::Error), + PageRead(PageReconstructError), + + /// Something went wrong deserializing metadata that we read to calculate logical size + #[error("decode error: {0}")] + Decode(#[from] DeserializeError), } #[derive(Debug, thiserror::Error)] @@ -110,7 +118,7 @@ impl From for CalculateLogicalSizeError { PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => { Self::Cancelled } - _ => Self::Other(pre.into()), + _ => Self::PageRead(pre), } } } @@ -763,7 +771,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn get_current_logical_size_non_incremental( + pub(crate) async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, ctx: &RequestContext, @@ -772,7 +780,7 @@ impl Timeline { // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; + let dbdir = DbDirectory::des(&buf)?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e6bfd57a447a..311338554c6a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4154,7 +4154,7 @@ mod tests { .await?; writer.finish_write(lsn); } - tline.freeze_and_flush().await + tline.freeze_and_flush().await.map_err(|e| e.into()) } #[tokio::test] diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 3ac799c69a21..1ec13882da8c 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -366,7 +366,10 @@ impl Layer { .0 .get_or_maybe_download(true, Some(ctx)) .await - .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?; + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; self.0 .access_stats @@ -1158,6 +1161,11 @@ impl LayerInner { let consecutive_failures = 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + if timeline.cancel.is_cancelled() { + // If we're shutting down, drop out before logging the error + return Err(e); + } + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); let backoff = utils::backoff::exponential_backoff_duration_seconds( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d4f6e25843b7..d07e2352fa8c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -138,7 +138,7 @@ use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub(super) enum FlushLoopState { +pub(crate) enum FlushLoopState { NotStarted, Running { #[cfg(test)] @@ -577,7 +577,7 @@ impl PageReconstructError { } #[derive(thiserror::Error, Debug)] -enum CreateImageLayersError { +pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] Cancelled, @@ -591,17 +591,35 @@ enum CreateImageLayersError { Other(#[from] anyhow::Error), } -#[derive(thiserror::Error, Debug)] -enum FlushLayerError { +#[derive(thiserror::Error, Debug, Clone)] +pub(crate) enum FlushLayerError { /// Timeline cancellation token was cancelled #[error("timeline shutting down")] Cancelled, + /// We tried to flush a layer while the Timeline is in an unexpected state + #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")] + NotRunning(FlushLoopState), + + // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush + // loop via a watch channel, where we can only borrow it. 
#[error(transparent)] - CreateImageLayersError(CreateImageLayersError), + CreateImageLayersError(Arc), #[error(transparent)] - Other(#[from] anyhow::Error), + Other(#[from] Arc), +} + +impl FlushLayerError { + // When crossing from generic anyhow errors to this error type, we explicitly check + // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. + fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { + if timeline.cancel.is_cancelled() { + Self::Cancelled + } else { + Self::Other(Arc::new(err)) + } + } } #[derive(thiserror::Error, Debug)] @@ -696,7 +714,7 @@ impl From for FlushLayerError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => FlushLayerError::Cancelled, - any => FlushLayerError::CreateImageLayersError(any), + any => FlushLayerError::CreateImageLayersError(Arc::new(any)), } } } @@ -1547,13 +1565,13 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] - pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> { self.freeze_and_flush0().await } // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. - pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> { + pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { let to_lsn = self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait(to_lsn).await } @@ -2735,11 +2753,6 @@ impl Timeline { self.current_logical_size.initialized.add_permits(1); } - enum BackgroundCalculationError { - Cancelled, - Other(anyhow::Error), - } - let try_once = |attempt: usize| { let background_ctx = &background_ctx; let self_ref = &self; @@ -2757,10 +2770,10 @@ impl Timeline { (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) } _ = self_ref.cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); } _ = cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end user interaction requested logical size @@ -2787,18 +2800,7 @@ impl Timeline { .await { Ok(calculated_size) => Ok((calculated_size, metrics_guard)), - Err(CalculateLogicalSizeError::Cancelled) => { - Err(BackgroundCalculationError::Cancelled) - } - Err(CalculateLogicalSizeError::Other(err)) => { - if let Some(PageReconstructError::AncestorStopping(_)) = - err.root_cause().downcast_ref() - { - Err(BackgroundCalculationError::Cancelled) - } else { - Err(BackgroundCalculationError::Other(err)) - } - } + Err(e) => Err(e), } } }; @@ -2810,8 +2812,11 @@ impl Timeline { match try_once(attempt).await { Ok(res) => return ControlFlow::Continue(res), - Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()), - Err(BackgroundCalculationError::Other(e)) => { + Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()), + Err( + e @ (CalculateLogicalSizeError::Decode(_) + | CalculateLogicalSizeError::PageRead(_)), + ) => { warn!(attempt, "initial size calculation failed: {e:?}"); // exponential back-off doesn't make sense at these long intervals; // use fixed 
retry interval with generous jitter instead @@ -3717,7 +3722,9 @@ impl Timeline { return; } err @ Err( - FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), + FlushLayerError::NotRunning(_) + | FlushLayerError::Other(_) + | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); break err.map(|_| ()); @@ -3763,7 +3770,10 @@ impl Timeline { /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, /// it means no data will be written between the top of the highest frozen layer and to_lsn, /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. - async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> { + async fn flush_frozen_layers_and_wait( + &self, + last_record_lsn: Lsn, + ) -> Result<(), FlushLayerError> { let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. @@ -3774,7 +3784,7 @@ impl Timeline { let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") + return Err(FlushLayerError::NotRunning(flush_loop_state)); } self.layer_flush_start_tx.send_modify(|(counter, lsn)| { @@ -3787,14 +3797,11 @@ impl Timeline { { let (last_result_counter, last_result) = &*rx.borrow(); if *last_result_counter >= my_flush_request { - if let Err(_err) = last_result { + if let Err(err) = last_result { // We already logged the original error in // flush_loop. We cannot propagate it to the caller // here, because it might not be Cloneable - anyhow::bail!( - "Could not flush frozen layer. Request id: {}", - my_flush_request - ); + return Err(err.clone()); } else { return Ok(()); } @@ -3803,7 +3810,7 @@ impl Timeline { trace!("waiting for flush to complete"); tokio::select! { rx_e = rx.changed() => { - rx_e?; + rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?; }, // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring // the notification from [`flush_loop`] that it completed. @@ -3875,7 +3882,8 @@ impl Timeline { EnumSet::empty(), ctx, ) - .await?; + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); @@ -3899,7 +3907,8 @@ impl Timeline { Some(metadata_keyspace.0.ranges[0].clone()), ctx, ) - .await? + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? } else { None }; @@ -3926,7 +3935,11 @@ impl Timeline { // Normal case, write out a L0 delta layer file. // `create_delta_layer` will not modify the layer map. // We will remove frozen layer and add delta layer in one atomic operation later. - let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else { + let Some(layer) = self + .create_delta_layer(&frozen_layer, None, ctx) + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? 
+ else { panic!("delta layer cannot be empty if no filter is applied"); }; ( @@ -3959,7 +3972,8 @@ impl Timeline { if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn - self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; + self.schedule_uploads(disk_consistent_lsn, layers_to_upload) + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; } // release lock on 'layers' }; diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index e6ddabe5b5c6..4fc89330ba42 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use super::{layer_manager::LayerManager, Timeline}; +use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, @@ -23,7 +23,7 @@ pub(crate) enum Error { #[error("shutting down, please retry later")] ShuttingDown, #[error("flushing failed")] - FlushAncestor(#[source] anyhow::Error), + FlushAncestor(#[source] FlushLayerError), #[error("layer download failed")] RewrittenDeltaDownloadFailed(#[source] anyhow::Error), #[error("copying LSN prefix locally failed")] diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b137fb3a5c2a..6fe23846c733 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -402,7 +402,7 @@ def get_resident_physical_size(): env.pageserver.allowed_errors.extend( [ ".*download failed: downloading evicted layer file failed.*", - f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed", + f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed", ] ) From e6db8069b0a5476504e723ff1b5bbe079afbac0b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 21:28:44 +0300 Subject: [PATCH 19/31] neon_walreader: check after local read that the segment still exists. Otherwise read might receive zeros/garbage if the file is recycled (renamed) for as a future segment. --- pgxn/neon/neon_walreader.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index e43f4d9d96c0..60eb8e1fc985 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -184,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti } else if (state->wre_errno == ENOENT) { - nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", - LSN_FORMAT_ARGS(startptr)); + nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr), count); return NeonWALReadRemote(state, buf, startptr, count, tli); } else @@ -614,6 +614,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun uint32 startoff; int segbytes; int readbytes; + XLogSegNo lastRemovedSegNo; startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); @@ -689,6 +690,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return false; } + /* + * Recheck that the segment hasn't been removed while we were reading + * it. 
+ */ + lastRemovedSegNo = XLogGetLastRemovedSegno(); + if (state->seg.ws_segno <= lastRemovedSegNo) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = ENOENT; + + XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT, + fname, lastRemovedSegNo); + return false; + } + /* Update state for read */ recptr += readbytes; nbytes -= readbytes; From af40bf3c2ee11f23ef10f8e2d99edc3024137a3b Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 13:34:04 +0300 Subject: [PATCH 20/31] Fix term/epoch confusion in python tests. Call epoch last_log_term and add separate term field. --- test_runner/fixtures/safekeeper/http.py | 6 ++++-- test_runner/regress/test_wal_acceptor.py | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index a5480f557f5b..11e6fef28f17 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -19,7 +19,8 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: - acceptor_epoch: int + term: int + last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 flush_lsn: Lsn commit_lsn: Lsn @@ -156,7 +157,8 @@ def timeline_status( resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( - acceptor_epoch=resj["acceptor_state"]["epoch"], + term=resj["acceptor_state"]["term"], + last_log_term=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], flush_lsn=Lsn(resj["flush_lsn"]), commit_lsn=Lsn(resj["commit_lsn"]), diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cff13e74ee4b..7dca6c3ec2be 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -841,7 +841,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch = tli_status.acceptor_epoch + term = tli_status.term timeline_start_lsn = tli_status.timeline_start_lsn if auth_enabled: @@ -862,8 +862,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): endpoint.safe_psql("insert into t values(10)") tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch_after_reboot = tli_status.acceptor_epoch - assert epoch_after_reboot > epoch + term_after_reboot = tli_status.term + assert term_after_reboot > term # and timeline_start_lsn stays the same assert tli_status.timeline_start_lsn == timeline_start_lsn @@ -1104,11 +1104,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline # First check that term / flush_lsn are the same: it is easier to # report/understand if WALs are different due to that. 
statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] - term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): assert ( term_flush_lsns[0] == tfl - ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" # check that WALs are identic. segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] From 1fcc2b37eba2e3be3f248c50713a6cda02a99453 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 13:43:28 +0300 Subject: [PATCH 21/31] Add test checking term change during pull_timeline. --- test_runner/regress/test_wal_acceptor.py | 60 ++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 7dca6c3ec2be..dce30f5388fe 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -17,6 +17,7 @@ import psycopg2.errors import psycopg2.extras import pytest +import requests from fixtures.broker import NeonBroker from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log @@ -1867,6 +1868,65 @@ def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}" +# Test pull_timeline while concurrently changing term on the donor: +# 1) Start pull_timeline, listing files to fetch. +# 2) Change term on the donor +# 3) Finish pull_timeline. +# +# Currently (until proper membership change procedure), we want to pull_timeline +# to fetch the log up to . This is unsafe if term +# changes during the procedure (unless timeline is locked all the time but we +# don't want that): recepient might end up with mix of WAL from different +# histories. Thus the schedule above is expected to fail. Later we'd allow +# pull_timeline to only initialize timeline to any valid state (up to +# commit_lsn), holding switch to fully new configuration until it recovers +# enough, so it won't be affected by term change anymore. +# +# Expected to fail while term check is not implemented. 
+@pytest.mark.xfail +def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + + dst_http = dst_sk.http_client() + # run pull_timeline which will halt before downloading files + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "pause")) + pt_handle = PropagatingThread( + target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) + ) + pt_handle.start() + dst_sk.wait_until_paused("sk-pull-timeline-after-list-pausable") + + src_http = src_sk.http_client() + term_before = src_http.timeline_status(tenant_id, timeline_id).term + + # restart compute to bump term + ep.stop() + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'pear'") + + term_after = src_http.timeline_status(tenant_id, timeline_id).term + assert term_after > term_before, f"term_after={term_after}, term_before={term_before}" + + dst_http.configure_failpoints(("sk-pull-timeline-after-list-pausable", "off")) + with pytest.raises(requests.exceptions.HTTPError): + pt_handle.join() + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 7ec70b5eff731f6072e4804050d49f0b951ef872 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 May 2024 12:47:04 +0300 Subject: [PATCH 22/31] safekeeper: rename epoch to last_log_term. epoch is a historical and potentially confusing name. It semantically means lastLogTerm from the raft paper, so let's use it. This commit changes only internal namings, not public interface (http). 
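As an aside for readers less familiar with the Raft terminology: the last log term is the term of the highest entry that has actually been flushed to the log, which can lag behind the acceptor's current (voted) term. Below is a minimal, self-contained sketch of that idea, using simplified hypothetical types rather than the safekeeper's real `TermHistory` API (whose exact boundary handling may differ):

```
/// Sketch only: last_log_term is the term of the highest flushed log entry,
/// which may be older than the acceptor's current term.
#[derive(Clone, Copy)]
struct TermLsn {
    term: u64,
    lsn: u64, // LSN at which this term starts
}

struct TermHistory(Vec<TermLsn>);

impl TermHistory {
    /// Term of the last entry starting below `flush_lsn`; 0 for an empty history.
    fn last_log_term(&self, flush_lsn: u64) -> u64 {
        self.0
            .iter()
            .rev()
            .find(|e| e.lsn < flush_lsn) // only terms with flushed WAL count
            .map(|e| e.term)
            .unwrap_or(0)
    }
}

fn main() {
    let history = TermHistory(vec![
        TermLsn { term: 1, lsn: 0 },
        TermLsn { term: 3, lsn: 100 },
    ]);
    // WAL flushed up to 150: the latest flushed entry belongs to term 3.
    assert_eq!(history.last_log_term(150), 3);
    // WAL flushed only up to 50: term 3 has no flushed WAL yet, so it is 1.
    assert_eq!(history.last_log_term(50), 1);
}
```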
--- safekeeper/src/http/routes.rs | 8 +++--- safekeeper/src/json_ctrl.rs | 2 +- safekeeper/src/recovery.rs | 2 +- safekeeper/src/safekeeper.rs | 53 ++++++++++++++++++----------------- safekeeper/src/timeline.rs | 8 +++--- 5 files changed, 38 insertions(+), 35 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 4aacd3421dc0..593e102e353f 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -85,11 +85,11 @@ impl From for TermLsn { } } -/// Augment AcceptorState with epoch for convenience +/// Augment AcceptorState with last_log_term for convenience #[derive(Debug, Serialize, Deserialize)] pub struct AcceptorStateStatus { pub term: Term, - pub epoch: Term, + pub epoch: Term, // aka last_log_term pub term_history: Vec, } @@ -130,7 +130,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result { let ar_hdr = AppendRequestHeader { term: donor.term, - epoch_start_lsn: Lsn::INVALID, // unused + term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 2a620f5fefc8..4686c9aa8ea2 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -188,8 +188,8 @@ pub struct AcceptorState { } impl AcceptorState { - /// acceptor's epoch is the term of the highest entry in the log - pub fn get_epoch(&self, flush_lsn: Lsn) -> Term { + /// acceptor's last_log_term is the term of the highest entry in the log + pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term { let th = self.term_history.up_to(flush_lsn); match th.0.last() { Some(e) => e.term, @@ -305,9 +305,9 @@ pub struct AppendRequest { pub struct AppendRequestHeader { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, - // TODO: remove this field, it in unused -- LSN of term switch can be taken - // from ProposerElected (as well as from term history). - pub epoch_start_lsn: Lsn, + // TODO: remove this field from the protocol, it in unused -- LSN of term + // switch can be taken from ProposerElected (as well as from term history). + pub term_start_lsn: Lsn, /// start position of message in WAL pub begin_lsn: Lsn, /// end position of message in WAL @@ -326,9 +326,10 @@ pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, - // NOTE: this is physical end of wal on safekeeper; currently it doesn't - // make much sense without taking epoch into account, as history can be - // diverged. + // Flushed end of wal on safekeeper; one should be always mindful from what + // term history this value comes, either checking history directly or + // observing term being set to one for which WAL truncation is known to have + // happened. pub flush_lsn: Lsn, // We report back our awareness about which WAL is committed, as this is // a criterion for walproposer --sync mode exit @@ -482,8 +483,8 @@ impl AcceptorProposerMessage { /// - messages from broker peers pub struct SafeKeeper { /// LSN since the proposer safekeeper currently talking to appends WAL; - /// determines epoch switch point. - pub epoch_start_lsn: Lsn, + /// determines last_log_term switch point. 
+ pub term_start_lsn: Lsn, pub state: TimelineState, // persistent state storage pub wal_store: WAL, @@ -511,7 +512,7 @@ where } Ok(SafeKeeper { - epoch_start_lsn: Lsn(0), + term_start_lsn: Lsn(0), state: TimelineState::new(state), wal_store, node_id, @@ -531,8 +532,10 @@ where self.state.acceptor_state.term } - pub fn get_epoch(&self) -> Term { - self.state.acceptor_state.get_epoch(self.flush_lsn()) + pub fn get_last_log_term(&self) -> Term { + self.state + .acceptor_state + .get_last_log_term(self.flush_lsn()) } /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. @@ -713,7 +716,7 @@ where // proceed, but to prevent commit_lsn surprisingly going down we should // either refuse the session (simpler) or skip the part we already have // from the stream (can be implemented). - if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at { + if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at { bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", msg.term, self.flush_lsn(), msg.start_streaming_at) } @@ -788,7 +791,7 @@ where // Cache LSN where term starts to immediately fsync control file with // commit_lsn once we reach it -- sync-safekeepers finishes when // persisted commit_lsn on majority of safekeepers aligns. - self.epoch_start_lsn = match msg.term_history.0.last() { + self.term_start_lsn = match msg.term_history.0.last() { None => bail!("proposer elected with empty term history"), Some(term_lsn_start) => term_lsn_start.lsn, }; @@ -814,11 +817,11 @@ where self.state.inmem.commit_lsn = commit_lsn; - // If new commit_lsn reached epoch switch, force sync of control + // If new commit_lsn reached term switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. Note: this is for sync-safekeepers mode only, as - // otherwise commit_lsn might jump over epoch_start_lsn. - if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn { + // otherwise commit_lsn might jump over term_start_lsn. + if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn { self.state.flush().await?; } @@ -933,7 +936,7 @@ where // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. - if sk_info.last_log_term == self.get_epoch() { + if sk_info.last_log_term == self.get_last_log_term() { self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } @@ -1079,7 +1082,7 @@ mod tests { } #[tokio::test] - async fn test_epoch_switch() { + async fn test_last_log_term_switch() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1089,7 +1092,7 @@ mod tests { let mut ar_hdr = AppendRequestHeader { term: 1, - epoch_start_lsn: Lsn(3), + term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), @@ -1114,14 +1117,14 @@ mod tests { .await .unwrap(); - // check that AppendRequest before epochStartLsn doesn't switch epoch + // check that AppendRequest before term_start_lsn doesn't switch last_log_term. 
let resp = sk .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) .await; assert!(resp.is_ok()); - assert_eq!(sk.get_epoch(), 0); + assert_eq!(sk.get_last_log_term(), 0); - // but record at epochStartLsn does the switch + // but record at term_start_lsn does the switch ar_hdr.begin_lsn = Lsn(2); ar_hdr.end_lsn = Lsn(3); append_request = AppendRequest { @@ -1133,7 +1136,7 @@ mod tests { .await; assert!(resp.is_ok()); sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_epoch(), 1); + assert_eq!(sk.get_last_log_term(), 1); } #[test] diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f30c503382e5..aa9ccfc21eca 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -244,7 +244,7 @@ impl SharedState { timeline_id: ttid.timeline_id.as_ref().to_owned(), }), term: self.sk.state.acceptor_state.term, - last_log_term: self.sk.get_epoch(), + last_log_term: self.sk.get_last_log_term(), flush_lsn: self.sk.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost commit_lsn: self.sk.state.inmem.commit_lsn.0, @@ -704,7 +704,7 @@ impl Timeline { pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { let ss = self.read_shared_state().await; let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_epoch(); + let last_log_term = ss.sk.get_last_log_term(); let flush_lsn = ss.sk.flush_lsn(); // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. let mut peers = ss.get_peers(heartbeat_timeout); @@ -844,7 +844,7 @@ impl Timeline { timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, + epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), persisted_state: state.sk.state.clone(), flush_lsn: state.sk.wal_store.flush_lsn(), @@ -867,7 +867,7 @@ impl Timeline { active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, + epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), write_lsn, write_record_lsn, From 5a394fde566eea238e317604a3e9b414bed04d1b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 13:31:42 +0100 Subject: [PATCH 23/31] pageserver: avoid spurious "bad state" logs/errors during shutdown (#7912) ## Problem - Initial size calculations tend to fail with `Bad state (not active)` Closes: https://github.com/neondatabase/neon/issues/7911 ## Summary of changes - In `wait_lsn`, return WaitLsnError::Cancelled rather than BadState when the state is Stopping - Replace PageReconstructError's `Other` variant with a specific `BadState` variant - Avoid returning anyhow::Error from get_ready_ancestor_timeline -- this was only used for the case where there was no ancestor. All callers of this function had implicitly checked that the ancestor timeline exists before calling it, so they can pass in the ancestor instead of handling an error. 
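The changes listed above boil down to one recurring error-mapping pattern: give shutdown/cancellation its own variant so callers can drop out quietly, and keep a descriptive variant (carrying the offending state) for genuinely unexpected situations. Here is a minimal, self-contained sketch of that pattern, with hypothetical names loosely mirroring the ones above rather than the pageserver's actual types:

```
#[derive(Debug, Clone, Copy)]
enum TimelineState {
    Active,
    Stopping,
    Broken,
}

#[derive(Debug)]
enum WaitError {
    // Expected during shutdown: callers treat this as benign and skip error logging.
    Shutdown,
    // Genuinely unexpected: carries the state so the log line says what went wrong.
    BadState(TimelineState),
}

fn check_active(state: TimelineState, cancelled: bool) -> Result<(), WaitError> {
    if cancelled || matches!(state, TimelineState::Stopping) {
        return Err(WaitError::Shutdown);
    }
    if !matches!(state, TimelineState::Active) {
        return Err(WaitError::BadState(state));
    }
    Ok(())
}

fn main() {
    // An active timeline passes the check.
    assert!(check_active(TimelineState::Active, false).is_ok());
    // A stopping (or cancelled) timeline maps to Shutdown, so shutdown paths stay quiet.
    assert!(matches!(
        check_active(TimelineState::Stopping, false),
        Err(WaitError::Shutdown)
    ));
    // A broken timeline still surfaces a descriptive error worth logging loudly.
    assert!(matches!(
        check_active(TimelineState::Broken, false),
        Err(WaitError::BadState(TimelineState::Broken))
    ));
}
```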
--- pageserver/src/page_service.rs | 4 +- pageserver/src/tenant.rs | 9 ++-- pageserver/src/tenant/timeline.rs | 89 +++++++++++++++---------------- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 35150b210e35..ae389826d5fc 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -373,7 +373,7 @@ impl From for PageStreamError { match value { e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()), } } } @@ -383,7 +383,7 @@ impl From for QueryError { match value { e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect, + WaitLsnError::BadState { .. } => Self::Reconnect, } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 311338554c6a..0a9637884fae 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1507,7 +1507,7 @@ impl Tenant { .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { - e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) } WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, @@ -4308,9 +4308,10 @@ mod tests { // This needs to traverse to the parent, and fails. let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err - .to_string() - .contains("will not become active. Current state: Broken")); + assert!(err.to_string().starts_with(&format!( + "Bad state on timeline {}: Broken", + tline.timeline_id + ))); Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d07e2352fa8c..b49887646549 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -496,7 +496,7 @@ pub(crate) enum PageReconstructError { Other(#[from] anyhow::Error), #[error("Ancestor LSN wait error: {0}")] - AncestorLsnTimeout(#[from] WaitLsnError), + AncestorLsnTimeout(WaitLsnError), #[error("timeline shutting down")] Cancelled, @@ -651,11 +651,14 @@ pub(crate) enum GetReadyAncestorError { #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(#[from] WaitLsnError), + #[error("Bad state on timeline {timeline_id}: {state:?}")] + BadState { + timeline_id: TimelineId, + state: TimelineState, + }, + #[error("Cancelled")] Cancelled, - - #[error(transparent)] - Other(#[from] anyhow::Error), } #[derive(Clone, Copy)] @@ -690,8 +693,8 @@ pub(crate) enum WaitLsnError { Shutdown, // Called on an timeline not in active state or shutting down - #[error("Bad state (not active)")] - BadState, + #[error("Bad timeline state: {0:?}")] + BadState(TimelineState), // Timeout expired while waiting for LSN to catch up with goal. #[error("{0}")] @@ -756,8 +759,8 @@ impl From for PageReconstructError { match e { AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + bad_state @ BadState { .. 
} => PageReconstructError::Other(anyhow::anyhow!(bad_state)), Cancelled => PageReconstructError::Cancelled, - Other(other) => PageReconstructError::Other(other), } } } @@ -1466,10 +1469,11 @@ impl Timeline { who_is_waiting: WaitLsnWaiter<'_>, ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { - if self.cancel.is_cancelled() { + let state = self.current_state(); + if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) { return Err(WaitLsnError::Shutdown); - } else if !self.is_active() { - return Err(WaitLsnError::BadState); + } else if !matches!(state, TimelineState::Active) { + return Err(WaitLsnError::BadState(state)); } if cfg!(debug_assertions) { @@ -3193,17 +3197,21 @@ impl Timeline { } // Recurse into ancestor if needed - if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); + if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { + if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); - timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?; - timeline = &*timeline_owned; - prev_lsn = None; - continue 'outer; + timeline_owned = timeline + .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .await?; + timeline = &*timeline_owned; + prev_lsn = None; + continue 'outer; + } } let guard = timeline.layers.read().await; @@ -3352,10 +3360,10 @@ impl Timeline { break None; } - // Not fully retrieved but no ancestor timeline. - if timeline.ancestor_timeline.is_none() { + let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else { + // Not fully retrieved but no ancestor timeline. break Some(keyspace); - } + }; // Now we see if there are keys covered by the image layer but does not exist in the // image layer, which means that the key does not exist. @@ -3375,7 +3383,7 @@ impl Timeline { // Take the min to avoid reconstructing a page with data newer than request Lsn. cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); timeline_owned = timeline - .get_ready_ancestor_timeline(ctx) + .get_ready_ancestor_timeline(ancestor_timeline, ctx) .await .map_err(GetVectoredError::GetReadyAncestorError)?; timeline = &*timeline_owned; @@ -3547,13 +3555,9 @@ impl Timeline { async fn get_ready_ancestor_timeline( &self, + ancestor: &Arc, ctx: &RequestContext, ) -> Result, GetReadyAncestorError> { - let ancestor = match self.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(GetReadyAncestorError::from(e)), - }; - // It's possible that the ancestor timeline isn't active yet, or // is active but hasn't yet caught up to the branch point. Wait // for it. @@ -3586,11 +3590,10 @@ impl Timeline { )); } Err(state) => { - return Err(GetReadyAncestorError::Other(anyhow::anyhow!( - "Timeline {} will not become active. 
Current state: {:?}", - ancestor.timeline_id, - &state, - ))); + return Err(GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }); } } ancestor @@ -3599,21 +3602,17 @@ impl Timeline { .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, - e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)), + WaitLsnError::BadState(state) => GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }, })?; - Ok(ancestor) + Ok(ancestor.clone()) } - pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result> { - let ancestor = self.ancestor_timeline.as_ref().with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) + pub(crate) fn get_ancestor_timeline(&self) -> Option> { + self.ancestor_timeline.clone() } pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { From 16b2e74037dfcacec2ceeef3bc597e75b435cc1b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 31 May 2024 14:19:45 +0100 Subject: [PATCH 24/31] Add FullAccessTimeline guard in safekeepers (#7887) This is a preparation for https://github.com/neondatabase/neon/issues/6337. The idea is to add FullAccessTimeline, which will act as a guard for tasks requiring access to WAL files. Eviction will be blocked on these tasks and WAL won't be deleted from disk until there is at least one active FullAccessTimeline. To get FullAccessTimeline, tasks call `tli.full_access_guard().await?`. After eviction is implemented, this function will be responsible for downloading missing WAL file and waiting until the download finishes. This commit also contains other small refactorings: - Separate `get_tenant_dir` and `get_timeline_dir` functions for building a local path. This is useful for looking at usages and finding tasks requiring access to local filesystem. 
- `timeline_manager` is now responsible for spawning all background tasks - WAL removal task is now spawned instantly after horizon is updated --- libs/remote_storage/src/lib.rs | 4 +- pageserver/src/deletion_queue.rs | 2 +- safekeeper/src/bin/safekeeper.rs | 11 +- safekeeper/src/control_file.rs | 37 ++- safekeeper/src/copy_timeline.rs | 26 +- safekeeper/src/debug_dump.rs | 38 +-- safekeeper/src/http/routes.rs | 20 +- safekeeper/src/json_ctrl.rs | 20 +- safekeeper/src/lib.rs | 14 +- safekeeper/src/pull_timeline.rs | 6 +- safekeeper/src/receive_wal.rs | 19 +- safekeeper/src/recovery.rs | 100 ++++++- safekeeper/src/remove_wal.rs | 56 ++-- safekeeper/src/safekeeper.rs | 19 -- safekeeper/src/send_wal.rs | 21 +- safekeeper/src/timeline.rs | 379 +++++++++---------------- safekeeper/src/timeline_manager.rs | 329 +++++++++++++++++---- safekeeper/src/timelines_global_map.rs | 21 +- safekeeper/src/wal_backup.rs | 100 +++---- safekeeper/src/wal_backup_partial.rs | 27 +- safekeeper/src/wal_storage.rs | 21 +- test_runner/fixtures/common_types.py | 12 + test_runner/fixtures/neon_fixtures.py | 16 ++ 23 files changed, 724 insertions(+), 574 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 708662f20f45..cb3df0985d61 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -121,8 +121,8 @@ impl RemotePath { self.0.file_name() } - pub fn join(&self, segment: &Utf8Path) -> Self { - Self(self.0.join(segment)) + pub fn join(&self, path: impl AsRef) -> Self { + Self(self.0.join(path)) } pub fn get_path(&self) -> &Utf8PathBuf { diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 8790a9b0a82d..3960fc1b999d 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -311,7 +311,7 @@ impl DeletionList { result.extend( timeline_layers .into_iter() - .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))), + .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))), ); } } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index aee3898ac792..7476654426a8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -29,13 +29,12 @@ use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; -use safekeeper::remove_wal; +use safekeeper::http; use safekeeper::wal_service; use safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use safekeeper::{broker, WAL_SERVICE_RUNTIME}; use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{http, WAL_REMOVER_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; @@ -441,14 +440,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("broker main".to_owned(), res)); tasks_handles.push(Box::pin(broker_task_handle)); - let conf_ = conf.clone(); - let wal_remover_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle()) - .spawn(remove_wal::task_main(conf_)) - .map(|res| ("WAL remover".to_owned(), res)); - tasks_handles.push(Box::pin(wal_remover_handle)); - set_build_info_metric(GIT_VERSION, BUILD_TAG); // TODO: update tokio-stream, convert to real async Stream with diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index fe9f2e6899d8..e9bb5202dad8 100644 --- a/safekeeper/src/control_file.rs 
+++ b/safekeeper/src/control_file.rs @@ -2,7 +2,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; @@ -12,9 +12,9 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::TimelinePersistentState; +use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; @@ -43,7 +43,7 @@ pub trait Storage: Deref { pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: Utf8PathBuf, - conf: SafeKeeperConf, + no_sync: bool, /// Last state persisted to disk. state: TimelinePersistentState, @@ -54,13 +54,12 @@ pub struct FileStorage { impl FileStorage { /// Initialize storage by loading state from disk. pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - - let state = Self::load_control_file_conf(conf, ttid)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let state = Self::load_control_file_from_dir(&timeline_dir)?; Ok(FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }) @@ -74,7 +73,7 @@ impl FileStorage { ) -> Result { let store = FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }; @@ -102,12 +101,9 @@ impl FileStorage { upgrade_control_file(buf, version) } - /// Load control file for given ttid at path specified by conf. - pub fn load_control_file_conf( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - ) -> Result { - let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); + /// Load control file from given directory. 
+ pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { + let path = timeline_dir.join(CONTROL_FILE_NAME); Self::load_control_file(path) } @@ -203,7 +199,7 @@ impl Storage for FileStorage { })?; let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?; + durable_rename(&control_partial_path, &control_path, !self.no_sync).await?; // update internal state self.state = s.clone(); @@ -233,12 +229,13 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, - FileStorage::load_control_file_conf(conf, ttid)?, + FileStorage::load_control_file_from_dir(&timeline_dir)?, )) } @@ -246,11 +243,11 @@ mod test { conf: &SafeKeeperConf, ttid: &TenantTimelineId, ) -> Result<(FileStorage, TimelinePersistentState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); let state = TimelinePersistentState::empty(); - let timeline_dir = conf.timeline_dir(ttid); let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } @@ -291,7 +288,7 @@ mod test { .await .expect("failed to persist state"); } - let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); + let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).await.unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data) diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 3023d4e2cb19..51cf4db6b5b6 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -15,10 +15,10 @@ use crate::{ control_file::{FileStorage, Storage}, pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, state::TimelinePersistentState, - timeline::{Timeline, TimelineError}, + timeline::{FullAccessTimeline, Timeline, TimelineError}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, SafeKeeperConf, + GlobalTimelines, }; // we don't want to have more than 10 segments on disk after copy, because they take space @@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> { } } + let source_tli = request.source.full_access_guard().await?; + let conf = &GlobalTimelines::get_global_config(); let ttid = request.destination_ttid; let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - let (mem_state, state) = request.source.get_state().await; + let (mem_state, state) = source_tli.get_state().await; let start_lsn = state.timeline_start_lsn; if start_lsn == Lsn::INVALID { bail!("timeline is not initialized"); @@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) -> Result<()> { { let commit_lsn = mem_state.commit_lsn; - let flush_lsn = request.source.get_flush_lsn().await; + let flush_lsn = source_tli.get_flush_lsn().await; info!( "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", @@ -127,10 +129,8 @@ pub async fn handle_request(request: Request) -> Result<()> { .await?; copy_disk_segments( - 
conf, - &state, + &source_tli, wal_seg_size, - &request.source.ttid, new_backup_lsn, request.until_lsn, &tli_dir_path, @@ -159,21 +159,13 @@ pub async fn handle_request(request: Request) -> Result<()> { } async fn copy_disk_segments( - conf: &SafeKeeperConf, - persisted_state: &TimelinePersistentState, + tli: &FullAccessTimeline, wal_seg_size: usize, - source_ttid: &TenantTimelineId, start_lsn: Lsn, end_lsn: Lsn, tli_dir_path: &Utf8PathBuf, ) -> Result<()> { - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - conf.timeline_dir(source_ttid), - persisted_state, - start_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(start_lsn).await?; let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index b50f2e1158b8..062ff4b3db79 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use anyhow::bail; use anyhow::Result; use camino::Utf8Path; +use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; @@ -26,7 +27,8 @@ use crate::safekeeper::TermHistory; use crate::send_wal::WalSenderState; use crate::state::TimelineMemState; use crate::state::TimelinePersistentState; -use crate::wal_storage::WalReader; +use crate::timeline::get_timeline_dir; +use crate::timeline::FullAccessTimeline; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -68,6 +70,7 @@ pub struct Response { pub struct TimelineDumpSer { pub tli: Arc, pub args: Args, + pub timeline_dir: Utf8PathBuf, pub runtime: Arc, } @@ -85,14 +88,20 @@ impl Serialize for TimelineDumpSer { where S: serde::Serializer, { - let dump = self - .runtime - .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone())); + let dump = self.runtime.block_on(build_from_tli_dump( + &self.tli, + &self.args, + &self.timeline_dir, + )); dump.serialize(serializer) } } -async fn build_from_tli_dump(timeline: Arc, args: Args) -> Timeline { +async fn build_from_tli_dump( + timeline: &Arc, + args: &Args, + timeline_dir: &Utf8Path, +) -> Timeline { let control_file = if args.dump_control_file { let mut state = timeline.get_state().await.1; if !args.dump_term_history { @@ -112,7 +121,8 @@ async fn build_from_tli_dump(timeline: Arc, args: Arg let disk_content = if args.dump_disk_content { // build_disk_content can fail, but we don't want to fail the whole // request because of that. - build_disk_content(&timeline.timeline_dir).ok() + // Note: timeline can be in offloaded state, this is not a problem. 
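The `calculate_digest` refactor in `debug_dump.rs` below now streams WAL through `tli.get_walreader(..)` and folds the bytes into a SHA-256 hash chunk by chunk. A minimal sketch of that hashing loop over any `Read` source using the `sha2` crate; the in-memory buffer and the 8 KiB chunk size are illustrative stand-ins for the WAL reader and `MAX_SEND_SIZE`:

```rust
use std::io::Read;

use sha2::{Digest, Sha256};

/// Hash everything `reader` yields, `chunk_size` bytes at a time.
fn digest_stream<R: Read>(mut reader: R, chunk_size: usize) -> std::io::Result<Vec<u8>> {
    let mut hasher = Sha256::new();
    let mut buf = vec![0u8; chunk_size];
    loop {
        let n = reader.read(&mut buf)?;
        if n == 0 {
            break; // EOF: nothing more to hash
        }
        hasher.update(&buf[..n]);
    }
    Ok(hasher.finalize().to_vec())
}

fn main() -> std::io::Result<()> {
    // Stand-in for WAL bytes streamed between from_lsn and until_lsn.
    let fake_wal = vec![0xAAu8; 1 << 20];
    let digest = digest_stream(fake_wal.as_slice(), 8192)?;
    let hex: String = digest.iter().map(|b| format!("{b:02x}")).collect();
    println!("sha256: {hex}");
    Ok(())
}
```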
+ build_disk_content(timeline_dir).ok() } else { None }; @@ -186,6 +196,7 @@ pub struct FileInfo { pub async fn build(args: Args) -> Result { let start_time = Utc::now(); let timelines_count = GlobalTimelines::timelines_count(); + let config = GlobalTimelines::get_global_config(); let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() { // If both tenant_id and timeline_id are specified, we can just get the @@ -223,12 +234,11 @@ pub async fn build(args: Args) -> Result { timelines.push(TimelineDumpSer { tli, args: args.clone(), + timeline_dir: get_timeline_dir(&config, &ttid), runtime: runtime.clone(), }); } - let config = GlobalTimelines::get_global_config(); - Ok(Response { start_time, finish_time: Utc::now(), @@ -316,27 +326,19 @@ pub struct TimelineDigest { } pub async fn calculate_digest( - tli: &Arc, + tli: &FullAccessTimeline, request: TimelineDigestRequest, ) -> Result { if request.from_lsn > request.until_lsn { bail!("from_lsn is greater than until_lsn"); } - let conf = GlobalTimelines::get_global_config(); let (_, persisted_state) = tli.get_state().await; - if persisted_state.timeline_start_lsn > request.from_lsn { bail!("requested LSN is before the start of the timeline"); } - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - tli.timeline_dir.clone(), - &persisted_state, - request.from_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(request.from_lsn).await?; let mut hasher = Sha256::new(); let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 593e102e353f..1e29b21fac6d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -249,6 +249,10 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result let filename: String = parse_request_param(&request, "filename")?; let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + let tli = tli + .full_access_guard() + .await + .map_err(ApiError::InternalServerError)?; - let filepath = tli.timeline_dir.join(filename); + let filepath = tli.get_timeline_dir().join(filename); let mut file = File::open(&filepath) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; @@ -287,7 +295,7 @@ async fn timeline_files_handler(request: Request) -> Result .map_err(|e| ApiError::InternalServerError(e.into())) } -/// Force persist control file and remove old WAL. +/// Force persist control file. async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -297,13 +305,13 @@ async fn timeline_checkpoint_handler(request: Request) -> Result( async fn prepare_safekeeper( ttid: TenantTimelineId, pg_version: u32, -) -> anyhow::Result> { - GlobalTimelines::create( +) -> anyhow::Result { + let tli = GlobalTimelines::create( ttid, ServerInfo { pg_version, @@ -115,10 +113,16 @@ async fn prepare_safekeeper( Lsn::INVALID, Lsn::INVALID, ) - .await + .await?; + + tli.full_access_guard().await } -async fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { +async fn send_proposer_elected( + tli: &FullAccessTimeline, + term: Term, + lsn: Lsn, +) -> anyhow::Result<()> { // add new term to existing history let history = tli.get_state().await.1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); @@ -147,7 +151,7 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
pub async fn append_logical_message( - tli: &Arc, + tli: &FullAccessTimeline, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 8d8d2cf23e9d..1a56ff736c35 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -7,10 +7,7 @@ use tokio::runtime::Runtime; use std::time::Duration; use storage_broker::Uri; -use utils::{ - auth::SwappableJwtAuth, - id::{NodeId, TenantId, TenantTimelineId}, -}; +use utils::{auth::SwappableJwtAuth, id::NodeId}; mod auth; pub mod broker; @@ -89,15 +86,6 @@ pub struct SafeKeeperConf { } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.workdir.join(tenant_id.to_string()) - } - - pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf { - self.tenant_dir(&ttid.tenant_id) - .join(ttid.timeline_id.to_string()) - } - pub fn is_wal_backup_enabled(&self) -> bool { self.remote_storage.is_some() && self.wal_backup_enabled } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index f7cc40f58adc..7b41c98cb856 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -17,7 +17,7 @@ use utils::{ use crate::{ control_file, debug_dump, http::routes::TimelineStatus, - timeline::{Timeline, TimelineError}, + timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}, wal_storage::{self, Storage}, GlobalTimelines, SafeKeeperConf, }; @@ -283,13 +283,13 @@ pub async fn load_temp_timeline( } // Move timeline dir to the correct location - let timeline_path = conf.timeline_dir(&ttid); + let timeline_path = get_timeline_dir(conf, &ttid); info!( "moving timeline {} from {} to {}", ttid, tmp_path, timeline_path ); - tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; + tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; tokio::fs::rename(tmp_path, &timeline_path).await?; let tli = GlobalTimelines::load_timeline(&guard, ttid) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 03cfa882c45c..7943a2fd8683 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -6,7 +6,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; @@ -213,7 +213,7 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - let mut tli: Option> = None; + let mut tli: Option = None; if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. 
let handle_end_fut = pgb.handle_copy_stream_end(end); @@ -233,7 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, - tli: &mut Option>, + tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -323,7 +323,7 @@ struct NetworkReader<'a, IO> { impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { async fn read_first_message( &mut self, - ) -> Result<(Arc, ProposerAcceptorMessage), CopyStreamHandlerEnd> { + ) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -337,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? + let tli = + GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .await?; + tli.full_access_guard().await? } _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( @@ -353,7 +356,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { msg_tx: Sender, msg_rx: Receiver, reply_tx: Sender, - tli: Arc, + tli: FullAccessTimeline, next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( @@ -448,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// replies to reply_tx; reading from socket and writing to disk in parallel is /// beneficial for performance, this struct provides writing to disk part. pub struct WalAcceptor { - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -461,7 +464,7 @@ impl WalAcceptor { /// /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper. pub fn spawn( - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index 568a512c4ac6..80a630b1e120 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -2,7 +2,7 @@ //! provide it, i.e. safekeeper lags too much. use std::time::SystemTime; -use std::{fmt, pin::pin, sync::Arc}; +use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; @@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config} use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; +use crate::timeline::FullAccessTimeline; use crate::{ http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, @@ -28,14 +29,14 @@ use crate::{ AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, TermLsn, VoteRequest, }, - timeline::{PeerInfo, Timeline}, + timeline::PeerInfo, SafeKeeperConf, }; /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. 
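The `WalAcceptor` wiring above keeps socket reading and disk writing decoupled through bounded channels so that network reads and WAL flushes can overlap. A rough sketch of that shape with `tokio::sync::mpsc`; the message types, queue sizes and the trivial "flush" step are placeholders, not the safekeeper's actual protocol:

```rust
use tokio::sync::mpsc;

#[derive(Debug)]
struct AppendRequest(Vec<u8>); // placeholder for ProposerAcceptorMessage
#[derive(Debug)]
struct AppendResponse(u64); // placeholder for AcceptorProposerMessage

#[tokio::main]
async fn main() {
    // Bounded queues, like MSG_QUEUE_SIZE / REPLY_QUEUE_SIZE in the patch.
    let (msg_tx, mut msg_rx) = mpsc::channel::<AppendRequest>(256);
    let (reply_tx, mut reply_rx) = mpsc::channel::<AppendResponse>(16);

    // "WalAcceptor": drains messages and writes them to disk (simulated).
    let acceptor = tokio::spawn(async move {
        let mut flushed = 0u64;
        while let Some(AppendRequest(wal)) = msg_rx.recv().await {
            flushed += wal.len() as u64; // stand-in for append + flush
            if reply_tx.send(AppendResponse(flushed)).await.is_err() {
                break; // reader side is gone
            }
        }
    });

    // "NetworkReader": feeds WAL chunks without waiting for each flush.
    for _ in 0..4 {
        msg_tx.send(AppendRequest(vec![0u8; 8192])).await.unwrap();
    }
    drop(msg_tx); // closing the channel stops the acceptor

    while let Some(resp) = reply_rx.recv().await {
        println!("acked up to {} bytes", resp.0);
    }
    acceptor.await.unwrap();
}
```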
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] -pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { +pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { info!("started"); let cancel = tli.cancel.clone(); @@ -47,6 +48,87 @@ pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { } } +/// Should we start fetching WAL from a peer safekeeper, and if yes, from +/// which? Answer is yes, i.e. .donors is not empty if 1) there is something +/// to fetch, and we can do that without running elections; 2) there is no +/// actively streaming compute, as we don't want to compete with it. +/// +/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal +/// to its last_log_term so we are sure such a leader ever had been elected. +/// +/// All possible donors are returned so that we could keep connection to the +/// current one if it is good even if it slightly lags behind. +/// +/// Note that term conditions above might be not met, but safekeepers are +/// still not aligned on last flush_lsn. Generally in this case until +/// elections are run it is not possible to say which safekeeper should +/// recover from which one -- history which would be committed is different +/// depending on assembled quorum (e.g. classic picture 8 from Raft paper). +/// Thus we don't try to predict it here. +async fn recovery_needed( + tli: &FullAccessTimeline, + heartbeat_timeout: Duration, +) -> RecoveryNeededInfo { + let ss = tli.read_shared_state().await; + let term = ss.sk.state.acceptor_state.term; + let last_log_term = ss.sk.get_last_log_term(); + let flush_lsn = ss.sk.flush_lsn(); + // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. + let mut peers = ss.get_peers(heartbeat_timeout); + // Sort by pairs. + peers.sort_by(|p1, p2| { + let tl1 = TermLsn { + term: p1.last_log_term, + lsn: p1.flush_lsn, + }; + let tl2 = TermLsn { + term: p2.last_log_term, + lsn: p2.flush_lsn, + }; + tl2.cmp(&tl1) // desc + }); + let num_streaming_computes = tli.get_walreceivers().get_num_streaming(); + let donors = if num_streaming_computes > 0 { + vec![] // If there is a streaming compute, don't try to recover to not intervene. + } else { + peers + .iter() + .filter_map(|candidate| { + // Are we interested in this candidate? + let candidate_tl = TermLsn { + term: candidate.last_log_term, + lsn: candidate.flush_lsn, + }; + let my_tl = TermLsn { + term: last_log_term, + lsn: flush_lsn, + }; + if my_tl < candidate_tl { + // Yes, we are interested. Can we pull from it without + // (re)running elections? It is possible if 1) his term + // is equal to his last_log_term so we could act on + // behalf of leader of this term (we must be sure he was + // ever elected) and 2) our term is not higher, or we'll refuse data. + if candidate.term == candidate.last_log_term && candidate.term >= term { + Some(Donor::from(candidate)) + } else { + None + } + } else { + None + } + }) + .collect() + }; + RecoveryNeededInfo { + term, + last_log_term, + flush_lsn, + peers, + num_streaming_computes, + donors, + } +} /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and /// fields to explain the choice. #[derive(Debug)] @@ -113,10 +195,10 @@ impl From<&PeerInfo> for Donor { const CHECK_INTERVAL_MS: u64 = 2000; /// Check regularly whether we need to start recovery. 
-async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { +async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { let check_duration = Duration::from_millis(CHECK_INTERVAL_MS); loop { - let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; match recovery_needed_info.donors.first() { Some(donor) => { info!( @@ -146,7 +228,7 @@ async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { /// Recover from the specified donor. Returns message explaining normal finish /// reason or error. async fn recover( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -232,7 +314,7 @@ async fn recover( // Pull WAL from donor, assuming handshake is already done. async fn recovery_stream( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, start_streaming_at: Lsn, conf: &SafeKeeperConf, @@ -316,7 +398,7 @@ async fn network_io( physical_stream: ReplicationStream, msg_tx: Sender, donor: Donor, - tli: Arc, + tli: FullAccessTimeline, conf: SafeKeeperConf, ) -> anyhow::Result> { let mut physical_stream = pin!(physical_stream); @@ -365,7 +447,7 @@ async fn network_io( } ReplicationMessage::PrimaryKeepAlive(_) => { // keepalive means nothing is being streamed for a while. Check whether we need to stop. - let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; // do current donors still contain one we currently connected to? if !recovery_needed_info .donors diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 3400eee9b780..b661e48cb5a6 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -1,41 +1,25 @@ -//! Thread removing old WAL. +use utils::lsn::Lsn; -use std::time::Duration; +use crate::timeline_manager::StateSnapshot; -use tokio::time::sleep; -use tracing::*; +/// Get oldest LSN we still need to keep. We hold WAL till it is consumed +/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 +/// offloading. +/// While it is safe to use inmem values for determining horizon, +/// we use persistent to make possible normal states less surprising. +/// All segments covering LSNs before horizon_lsn can be removed. 
+pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { + use std::cmp::min; -use crate::{GlobalTimelines, SafeKeeperConf}; - -pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> { - let wal_removal_interval = Duration::from_millis(5000); - loop { - let now = tokio::time::Instant::now(); - let tlis = GlobalTimelines::get_all(); - for tli in &tlis { - let ttid = tli.ttid; - async { - if let Err(e) = tli.maybe_persist_control_file(false).await { - warn!("failed to persist control file: {e}"); - } - if let Err(e) = tli.remove_old_wal().await { - error!("failed to remove WAL: {}", e); - } - } - .instrument(info_span!("WAL removal", ttid = %ttid)) - .await; - } - - let elapsed = now.elapsed(); - let total_timelines = tlis.len(); - - if elapsed > wal_removal_interval { - info!( - "WAL removal is too long, processed {} timelines in {:?}", - total_timelines, elapsed - ); - } - - sleep(wal_removal_interval).await; + let mut horizon_lsn = min( + state.cfile_remote_consistent_lsn, + state.cfile_peer_horizon_lsn, + ); + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn); + if let Some(extra_horizon_lsn) = extra_horizon_lsn { + horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } + + horizon_lsn } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4686c9aa8ea2..563dbbe31581 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -10,7 +10,6 @@ use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; -use std::time::Duration; use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -828,24 +827,6 @@ where Ok(()) } - /// Persist control file if there is something to save and enough time - /// passed after the last save. - pub async fn maybe_persist_inmem_control_file(&mut self, force: bool) -> Result { - const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); - if !force && self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL { - return Ok(false); - } - let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn - || self.state.inmem.backup_lsn > self.state.backup_lsn - || self.state.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn - || self.state.inmem.remote_consistent_lsn > self.state.remote_consistent_lsn; - if need_persist { - self.state.flush().await?; - trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); - } - Ok(need_persist) - } - /// Handle request to append WAL. 
#[allow(clippy::comparison_chain)] async fn handle_append_request( diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 5a9745e1c9b2..df75893838ee 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -5,7 +5,7 @@ use crate::handler::SafekeeperPostgresHandler; use crate::metrics::RECEIVED_PS_FEEDBACKS; use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; @@ -387,8 +387,10 @@ impl SafekeeperPostgresHandler { term: Option, ) -> Result<(), QueryError> { let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; + let full_access = tli.full_access_guard().await?; + if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term, tli.clone()) + .handle_start_replication_guts(pgb, start_pos, term, full_access) .await { let info = tli.get_safekeeper_info(&self.conf).await; @@ -405,7 +407,7 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, - tli: Arc, + tli: FullAccessTimeline, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); @@ -448,14 +450,7 @@ impl SafekeeperPostgresHandler { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse).await?; - let (_, persisted_state) = tli.get_state().await; - let wal_reader = WalReader::new( - self.conf.workdir.clone(), - self.conf.timeline_dir(&tli.ttid), - &persisted_state, - start_pos, - self.conf.is_wal_backup_enabled(), - )?; + let wal_reader = tli.get_walreader(start_pos).await?; // Split to concurrently receive and send data; replies are generally // not synchronized with sends, so this avoids deadlocks. @@ -532,7 +527,7 @@ impl EndWatch { /// A half driving sending WAL. struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, - tli: Arc, + tli: FullAccessTimeline, appname: Option, // Position since which we are sending next chunk. 
start_pos: Lsn, @@ -741,7 +736,7 @@ impl WalSender<'_, IO> { struct ReplyReader { reader: PostgresBackendReader, ws_guard: Arc, - tli: Arc, + tli: FullAccessTimeline, } impl ReplyReader { diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index aa9ccfc21eca..148a7e90bd4e 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,14 +3,14 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; -use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::fs; use tokio_util::sync::CancellationToken; +use utils::id::TenantId; use std::cmp::max; use std::ops::{Deref, DerefMut}; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; use std::time::Duration; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; @@ -26,7 +26,6 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use crate::receive_wal::WalReceivers; -use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo}; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, INVALID_TERM, @@ -38,8 +37,8 @@ use crate::wal_backup::{self}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; -use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage}; +use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; +use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -169,7 +168,6 @@ pub struct SharedState { pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. pub(crate) peers_info: PeersInfo, - pub(crate) last_removed_segno: XLogSegNo, } impl SharedState { @@ -197,33 +195,33 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let timeline_dir = conf.timeline_dir(ttid); - let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let control_store = + control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, peers_info: PeersInfo(vec![]), - last_removed_segno: 0, }) } /// Restore SharedState from control file. If file doesn't exist, bails out. 
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let timeline_dir = get_timeline_dir(conf, ttid); let control_store = control_file::FileStorage::restore_new(ttid, conf)?; if control_store.server.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), - last_removed_segno: 0, }) } @@ -275,24 +273,6 @@ impl SharedState { .cloned() .collect() } - - /// Get oldest segno we still need to keep. We hold WAL till it is consumed - /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 - /// offloading. - /// While it is safe to use inmem values for determining horizon, - /// we use persistent to make possible normal states less surprising. - fn get_horizon_segno(&self, extra_horizon_lsn: Option) -> XLogSegNo { - let state = &self.sk.state; - - use std::cmp::min; - let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn); - // we don't want to remove WAL that is not yet offloaded to s3 - horizon_lsn = min(horizon_lsn, state.backup_lsn); - if let Some(extra_horizon_lsn) = extra_horizon_lsn { - horizon_lsn = min(horizon_lsn, extra_horizon_lsn); - } - horizon_lsn.segment_number(state.server.wal_seg_size as usize) - } } #[derive(Debug, thiserror::Error)] @@ -349,22 +329,15 @@ pub struct Timeline { mutex: RwLock, walsenders: Arc, walreceivers: Arc, + timeline_dir: Utf8PathBuf, /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires pub(crate) cancel: CancellationToken, - /// Directory where timeline state is stored. - pub timeline_dir: Utf8PathBuf, - - /// Should we keep WAL on disk for active replication connections. - /// Especially useful for sharding, when different shards process WAL - /// with different speed. - // TODO: add `Arc` here instead of adding each field separately. - walsenders_keep_horizon: bool, - // timeline_manager controlled state pub(crate) broker_active: AtomicBool, pub(crate) wal_backup_active: AtomicBool, + pub(crate) last_removed_segno: AtomicU64, } impl Timeline { @@ -394,10 +367,10 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: conf.timeline_dir(&ttid), - walsenders_keep_horizon: conf.walsenders_keep_horizon, + timeline_dir: get_timeline_dir(conf, &ttid), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -430,10 +403,10 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: conf.timeline_dir(&ttid), - walsenders_keep_horizon: conf.walsenders_keep_horizon, + timeline_dir: get_timeline_dir(conf, &ttid), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -494,15 +467,6 @@ impl Timeline { conf.clone(), broker_active_set, )); - - // Start recovery task which always runs on the timeline. 
- if conf.peer_recovery_enabled { - tokio::spawn(recovery_main(self.clone(), conf.clone())); - } - // TODO: migrate to timeline_manager - if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { - tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); - } } /// Delete timeline from disk completely, by removing timeline directory. @@ -555,36 +519,6 @@ impl Timeline { self.mutex.read().await } - /// Returns true if walsender should stop sending WAL to pageserver. We - /// terminate it if remote_consistent_lsn reached commit_lsn and there is no - /// computes. While there might be nothing to stream already, we learn about - /// remote_consistent_lsn update through replication feedback, and we want - /// to stop pushing to the broker if pageserver is fully caughtup. - pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { - if self.is_cancelled() { - return true; - } - let shared_state = self.read_shared_state().await; - if self.walreceivers.get_num() == 0 { - return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet - reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; - } - false - } - - /// Ensure that current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result { - let ss = self.read_shared_state().await; - if ss.sk.state.acceptor_state.term != t { - bail!( - "failed to acquire term {}, current term {}", - t, - ss.sk.state.acceptor_state.term - ); - } - Ok(ss) - } - /// Returns commit_lsn watch channel. pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver { self.commit_lsn_watch_rx.clone() @@ -600,28 +534,6 @@ impl Timeline { self.shared_state_version_rx.clone() } - /// Pass arrived message to the safekeeper. - pub async fn process_msg( - self: &Arc, - msg: &ProposerAcceptorMessage, - ) -> Result> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - let mut rmsg: Option; - { - let mut shared_state = self.write_shared_state().await; - rmsg = shared_state.sk.process_msg(msg).await?; - - // if this is AppendResponse, fill in proper hot standby feedback. - if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; - } - } - Ok(rmsg) - } - /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { self.read_shared_state().await.get_wal_seg_size() @@ -672,97 +584,11 @@ impl Timeline { Ok(()) } - /// Update in memory remote consistent lsn. - pub async fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) { - let mut shared_state = self.write_shared_state().await; - shared_state.sk.state.inmem.remote_consistent_lsn = - max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); - } - pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } - /// Should we start fetching WAL from a peer safekeeper, and if yes, from - /// which? Answer is yes, i.e. .donors is not empty if 1) there is something - /// to fetch, and we can do that without running elections; 2) there is no - /// actively streaming compute, as we don't want to compete with it. - /// - /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal - /// to its last_log_term so we are sure such a leader ever had been elected. 
- /// - /// All possible donors are returned so that we could keep connection to the - /// current one if it is good even if it slightly lags behind. - /// - /// Note that term conditions above might be not met, but safekeepers are - /// still not aligned on last flush_lsn. Generally in this case until - /// elections are run it is not possible to say which safekeeper should - /// recover from which one -- history which would be committed is different - /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). - /// Thus we don't try to predict it here. - pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.read_shared_state().await; - let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_last_log_term(); - let flush_lsn = ss.sk.flush_lsn(); - // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. - let mut peers = ss.get_peers(heartbeat_timeout); - // Sort by pairs. - peers.sort_by(|p1, p2| { - let tl1 = TermLsn { - term: p1.last_log_term, - lsn: p1.flush_lsn, - }; - let tl2 = TermLsn { - term: p2.last_log_term, - lsn: p2.flush_lsn, - }; - tl2.cmp(&tl1) // desc - }); - let num_streaming_computes = self.walreceivers.get_num_streaming(); - let donors = if num_streaming_computes > 0 { - vec![] // If there is a streaming compute, don't try to recover to not intervene. - } else { - peers - .iter() - .filter_map(|candidate| { - // Are we interested in this candidate? - let candidate_tl = TermLsn { - term: candidate.last_log_term, - lsn: candidate.flush_lsn, - }; - let my_tl = TermLsn { - term: last_log_term, - lsn: flush_lsn, - }; - if my_tl < candidate_tl { - // Yes, we are interested. Can we pull from it without - // (re)running elections? It is possible if 1) his term - // is equal to his last_log_term so we could act on - // behalf of leader of this term (we must be sure he was - // ever elected) and 2) our term is not higher, or we'll refuse data. - if candidate.term == candidate.last_log_term && candidate.term >= term { - Some(Donor::from(candidate)) - } else { - None - } - } else { - None - } - }) - .collect() - }; - RecoveryNeededInfo { - term, - last_log_term, - flush_lsn, - peers, - num_streaming_computes, - donors, - } - } - pub fn get_walsenders(&self) -> &Arc { &self.walsenders } @@ -776,58 +602,6 @@ impl Timeline { self.read_shared_state().await.sk.wal_store.flush_lsn() } - /// Delete WAL segments from disk that are no longer needed. This is determined - /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub async fn remove_old_wal(self: &Arc) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. - // This allows to get better read speed for pageservers that are lagging behind, - // at the cost of keeping more WAL on disk. 
- let replication_horizon_lsn = if self.walsenders_keep_horizon { - self.walsenders.laggard_lsn() - } else { - None - }; - - let horizon_segno: XLogSegNo; - let remover = { - let shared_state = self.read_shared_state().await; - horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn); - if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { - return Ok(()); // nothing to do - } - - // release the lock before removing - shared_state.sk.wal_store.remove_up_to(horizon_segno - 1) - }; - - // delete old WAL files - remover.await?; - - // update last_removed_segno - let mut shared_state = self.write_shared_state().await; - if shared_state.last_removed_segno != horizon_segno { - shared_state.last_removed_segno = horizon_segno; - } else { - shared_state.skip_update = true; - } - Ok(()) - } - - /// Persist control file if there is something to save and enough time - /// passed after the last save. This helps to keep remote_consistent_lsn up - /// to date so that storage nodes restart doesn't cause many pageserver -> - /// safekeeper reconnections. - pub async fn maybe_persist_control_file(self: &Arc, force: bool) -> Result<()> { - let mut guard = self.write_shared_state().await; - let changed = guard.sk.maybe_persist_inmem_control_file(force).await?; - guard.skip_update = !changed; - Ok(()) - } - /// Gather timeline data for metrics. pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { @@ -843,7 +617,7 @@ impl Timeline { wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), persisted_state: state.sk.state.clone(), @@ -866,7 +640,7 @@ impl Timeline { wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), epoch_start_lsn: state.sk.term_start_lsn, mem_state: state.sk.state.inmem.clone(), write_lsn, @@ -889,6 +663,110 @@ impl Timeline { state.sk.state.finish_change(&persistent_state).await?; Ok(res) } + + /// Get the timeline guard for reading/writing WAL files. + /// TODO: if WAL files are not present on disk (evicted), they will be + /// downloaded from S3. Also there will logic for preventing eviction + /// while someone is holding FullAccessTimeline guard. + pub async fn full_access_guard(self: &Arc) -> Result { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + Ok(FullAccessTimeline { tli: self.clone() }) + } +} + +/// This is a guard that allows to read/write disk timeline state. +/// All tasks that are using the disk should use this guard. +#[derive(Clone)] +pub struct FullAccessTimeline { + pub tli: Arc, +} + +impl Deref for FullAccessTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl FullAccessTimeline { + /// Returns true if walsender should stop sending WAL to pageserver. We + /// terminate it if remote_consistent_lsn reached commit_lsn and there is no + /// computes. 
While there might be nothing to stream already, we learn about + /// remote_consistent_lsn update through replication feedback, and we want + /// to stop pushing to the broker if pageserver is fully caughtup. + pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { + if self.is_cancelled() { + return true; + } + let shared_state = self.read_shared_state().await; + if self.walreceivers.get_num() == 0 { + return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet + reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; + } + false + } + + /// Ensure that current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; + if ss.sk.state.acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state.acceptor_state.term + ); + } + Ok(ss) + } + + /// Pass arrived message to the safekeeper. + pub async fn process_msg( + &self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let mut rmsg: Option; + { + let mut shared_state = self.write_shared_state().await; + rmsg = shared_state.sk.process_msg(msg).await?; + + // if this is AppendResponse, fill in proper hot standby feedback. + if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; + } + } + Ok(rmsg) + } + + pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { + let (_, persisted_state) = self.get_state().await; + let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled(); + + WalReader::new( + &self.ttid, + self.timeline_dir.clone(), + &persisted_state, + start_lsn, + enable_remote_read, + ) + } + + pub fn get_timeline_dir(&self) -> Utf8PathBuf { + self.timeline_dir.clone() + } + + /// Update in memory remote consistent lsn. + pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + let mut shared_state = self.write_shared_state().await; + shared_state.sk.state.inmem.remote_consistent_lsn = + max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); + } } /// Deletes directory and it's contents. Returns false if directory does not exist. @@ -899,3 +777,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { Err(e) => Err(e.into()), } } + +/// Get a path to the tenant directory. If you just need to get a timeline directory, +/// use FullAccessTimeline::get_timeline_dir instead. +pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { + conf.workdir.join(tenant_id.to_string()) +} + +/// Get a path to the timeline directory. If you need to read WAL files from disk, +/// use FullAccessTimeline::get_timeline_dir instead. This function does not check +/// timeline eviction status and WAL files might not be present on disk. +pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { + get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) +} diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index ed544352f984..84862207d5bf 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -3,23 +3,42 @@ //! It watches for changes in the timeline state and decides when to spawn or kill background tasks. //! 
It also can manage some reactive state, like should the timeline be active for broker pushes or not. -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; +use postgres_ffi::XLogSegNo; +use tokio::task::{JoinError, JoinHandle}; use tracing::{info, instrument, warn}; use utils::lsn::Lsn; use crate::{ + control_file::Storage, metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + recovery::recovery_main, + remove_wal::calc_horizon_lsn, + send_wal::WalSenders, timeline::{PeerInfo, ReadGuardSharedState, Timeline}, - timelines_set::TimelinesSet, + timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - SafeKeeperConf, + wal_backup_partial, SafeKeeperConf, }; pub struct StateSnapshot { + // inmem values pub commit_lsn: Lsn, pub backup_lsn: Lsn, pub remote_consistent_lsn: Lsn, + + // persistent control file values + pub cfile_peer_horizon_lsn: Lsn, + pub cfile_remote_consistent_lsn: Lsn, + pub cfile_backup_lsn: Lsn, + + // misc + pub cfile_last_persist_at: Instant, + pub inmem_flush_pending: bool, pub peers: Vec, } @@ -30,17 +49,34 @@ impl StateSnapshot { commit_lsn: read_guard.sk.state.inmem.commit_lsn, backup_lsn: read_guard.sk.state.inmem.backup_lsn, remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, + cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn, + cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn, + cfile_backup_lsn: read_guard.sk.state.backup_lsn, + cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(), + inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard), peers: read_guard.get_peers(heartbeat_timeout), } } + + fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool { + let state = &read_guard.sk.state; + state.inmem.commit_lsn > state.commit_lsn + || state.inmem.backup_lsn > state.backup_lsn + || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn + || state.inmem.remote_consistent_lsn > state.remote_consistent_lsn + } } /// Control how often the manager task should wake up to check updates. /// There is no need to check for updates more often than this. const REFRESH_INTERVAL: Duration = Duration::from_millis(300); +/// How often to save the control file if the is no other activity. +const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); + /// This task gets spawned alongside each timeline and is responsible for managing the timeline's /// background tasks. +/// Be careful, this task is not respawned on panic, so it should not panic. 
#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( tli: Arc, @@ -55,20 +91,50 @@ pub async fn main_task( } }; - // sets whether timeline is active for broker pushes or not - let mut tli_broker_active = broker_active_set.guard(tli.clone()); - - let ttid = tli.ttid; + // configuration & dependencies let wal_seg_size = tli.get_wal_seg_size().await; let heartbeat_timeout = conf.heartbeat_timeout; + let walsenders = tli.get_walsenders(); + let walreceivers = tli.get_walreceivers(); + // current state let mut state_version_rx = tli.get_state_version_rx(); - - let walreceivers = tli.get_walreceivers(); let mut num_computes_rx = walreceivers.get_num_rx(); + let mut tli_broker_active = broker_active_set.guard(tli.clone()); + let mut last_removed_segno = 0 as XLogSegNo; // list of background tasks let mut backup_task: Option = None; + let mut recovery_task: Option> = None; + let mut partial_backup_task: Option> = None; + let mut wal_removal_task: Option>> = None; + + // Start recovery task which always runs on the timeline. + if conf.peer_recovery_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone()))); + } + Err(e) => { + warn!("failed to start recovery task: {:?}", e); + } + } + } + + // Start partial backup task which always runs on the timeline. + if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + tli, + conf.clone(), + ))); + } + Err(e) => { + warn!("failed to start partial backup task: {:?}", e); + } + } + } let last_state = 'outer: loop { MANAGER_ITERATIONS_TOTAL.inc(); @@ -76,47 +142,36 @@ pub async fn main_task( let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); let num_computes = *num_computes_rx.borrow(); - let is_wal_backup_required = - wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot); - - if conf.is_wal_backup_enabled() { - wal_backup::update_task( - &conf, - ttid, - is_wal_backup_required, - &state_snapshot, - &mut backup_task, - ) - .await; - } - - let is_active = is_wal_backup_required - || num_computes > 0 - || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn; - - // update the broker timeline set - if tli_broker_active.set(is_active) { - // write log if state has changed - info!( - "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", - is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn, - ); + let is_wal_backup_required = update_backup( + &conf, + &tli, + wal_seg_size, + num_computes, + &state_snapshot, + &mut backup_task, + ) + .await; - MANAGER_ACTIVE_CHANGES.inc(); + let _is_active = update_is_active( + is_wal_backup_required, + num_computes, + &state_snapshot, + &mut tli_broker_active, + &tli, + ); - if !is_active { - // TODO: maybe use tokio::spawn? 
- if let Err(e) = tli.maybe_persist_control_file(false).await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } + let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await; - // update the state in Arc - tli.wal_backup_active - .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); - tli.broker_active - .store(is_active, std::sync::atomic::Ordering::Relaxed); + update_wal_removal( + &conf, + walsenders, + &tli, + wal_seg_size, + &state_snapshot, + last_removed_segno, + &mut wal_removal_task, + ) + .await; // wait until something changes. tx channels are stored under Arc, so they will not be // dropped until the manager task is finished. @@ -135,11 +190,189 @@ pub async fn main_task( _ = num_computes_rx.changed() => { // number of connected computes was updated } + _ = async { + if let Some(timeout) = next_cfile_save { + tokio::time::sleep_until(timeout).await + } else { + futures::future::pending().await + } + } => { + // it's time to save the control file + } + res = async { + if let Some(task) = &mut wal_removal_task { + task.await + } else { + futures::future::pending().await + } + } => { + // WAL removal task finished + wal_removal_task = None; + update_wal_removal_end(res, &tli, &mut last_removed_segno); + } } }; // shutdown background tasks if conf.is_wal_backup_enabled() { - wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await; + wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await; + } + + if let Some(recovery_task) = recovery_task { + if let Err(e) = recovery_task.await { + warn!("recovery task failed: {:?}", e); + } + } + + if let Some(partial_backup_task) = partial_backup_task { + if let Err(e) = partial_backup_task.await { + warn!("partial backup task failed: {:?}", e); + } + } + + if let Some(wal_removal_task) = wal_removal_task { + let res = wal_removal_task.await; + update_wal_removal_end(res, &tli, &mut last_removed_segno); + } +} + +/// Spawns/kills backup task and returns true if backup is required. +async fn update_backup( + conf: &SafeKeeperConf, + tli: &Arc, + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, + backup_task: &mut Option, +) -> bool { + let is_wal_backup_required = + wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state); + + if conf.is_wal_backup_enabled() { + wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await; + } + + // update the state in Arc + tli.wal_backup_active + .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); + is_wal_backup_required +} + +/// Update is_active flag and returns its value. +fn update_is_active( + is_wal_backup_required: bool, + num_computes: usize, + state: &StateSnapshot, + tli_broker_active: &mut TimelineSetGuard, + tli: &Arc, +) -> bool { + let is_active = is_wal_backup_required + || num_computes > 0 + || state.remote_consistent_lsn < state.commit_lsn; + + // update the broker timeline set + if tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state.remote_consistent_lsn, state.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); } + + // update the state in Arc + tli.broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + is_active +} + +/// Save control file if needed. Returns Instant if we should persist the control file in the future. 
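The manager loop above gains `select!` arms that are only sometimes armed, such as a pending control file save or an in-flight WAL removal task. The trick is to fall back to `futures::future::pending()` so that an absent branch can never fire. A stripped-down sketch of that pattern; the tick interval and deadline are invented values:

```rust
use std::time::Duration;

use tokio::time::{sleep, sleep_until, Instant};

#[tokio::main]
async fn main() {
    // Sometimes there is a deadline to act on, sometimes not.
    let mut next_save: Option<Instant> = Some(Instant::now() + Duration::from_millis(200));
    let mut ticks = 0u32;

    loop {
        tokio::select! {
            // Always-armed branch: periodic wakeup.
            _ = sleep(Duration::from_millis(80)) => {
                ticks += 1;
                if ticks >= 5 {
                    break;
                }
            }
            // Optionally-armed branch: wait on the deadline if there is one,
            // otherwise park forever so this arm can never win the select.
            _ = async move {
                match next_save {
                    Some(deadline) => sleep_until(deadline).await,
                    None => futures::future::pending().await,
                }
            } => {
                println!("deadline hit, saving control file");
                next_save = None; // nothing scheduled until state becomes dirty again
            }
        }
    }
}
```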
+async fn update_control_file_save( + state: &StateSnapshot, + tli: &Arc, +) -> Option { + if !state.inmem_flush_pending { + return None; + } + + if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL { + let mut write_guard = tli.write_shared_state().await; + // this can be done in the background because it blocks manager task, but flush() should + // be fast enough not to be a problem now + if let Err(e) = write_guard.sk.state.flush().await { + warn!("failed to save control file: {:?}", e); + } + + None + } else { + // we should wait until next CF_SAVE_INTERVAL + Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into()) + } +} + +/// Spawns WAL removal task if needed. +async fn update_wal_removal( + conf: &SafeKeeperConf, + walsenders: &Arc, + tli: &Arc, + wal_seg_size: usize, + state: &StateSnapshot, + last_removed_segno: u64, + wal_removal_task: &mut Option>>, +) { + if wal_removal_task.is_some() { + // WAL removal is already in progress + return; + } + + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. + let replication_horizon_lsn = if conf.walsenders_keep_horizon { + walsenders.laggard_lsn() + } else { + None + }; + + let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); + let removal_horizon_segno = removal_horizon_lsn + .segment_number(wal_seg_size) + .saturating_sub(1); + + if removal_horizon_segno > last_removed_segno { + // we need to remove WAL + let remover = crate::wal_storage::Storage::remove_up_to( + &tli.read_shared_state().await.sk.wal_store, + removal_horizon_segno, + ); + *wal_removal_task = Some(tokio::spawn(async move { + remover.await?; + Ok(removal_horizon_segno) + })); + } +} + +/// Update the state after WAL removal task finished. +fn update_wal_removal_end( + res: Result, JoinError>, + tli: &Arc, + last_removed_segno: &mut u64, +) { + let new_last_removed_segno = match res { + Ok(Ok(segno)) => segno, + Err(e) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + Ok(Err(e)) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + }; + + *last_removed_segno = new_last_removed_segno; + // update the state in Arc + tli.last_removed_segno + .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 8d37bd637150..45e08ede3c0a 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -3,7 +3,7 @@ //! all from the disk on startup and keeping them in memory. use crate::safekeeper::ServerInfo; -use crate::timeline::{Timeline, TimelineError}; +use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; @@ -127,7 +127,7 @@ impl GlobalTimelines { state.get_dependencies() }; - let timelines_dir = conf.tenant_dir(&tenant_id); + let timelines_dir = get_tenant_dir(&conf, &tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) .with_context(|| format!("failed to list timelines dir {}", timelines_dir))? { @@ -348,11 +348,7 @@ impl GlobalTimelines { } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. 
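In the removal path above, the horizon LSN is translated into a segment number and everything strictly below it may be deleted by a spawned task that reports the new `last_removed_segno` back to the manager. A hedged sketch of the arithmetic and the task shape; the 16 MiB segment size and the fake remover are assumptions for illustration, while the real code uses `Lsn::segment_number` and the WAL storage's `remove_up_to`:

```rust
use tokio::task::JoinHandle;

const WAL_SEG_SIZE: u64 = 16 * 1024 * 1024; // assume 16 MiB segments

/// Same arithmetic as Lsn::segment_number: which segment contains this LSN.
fn segment_number(lsn: u64, wal_seg_size: u64) -> u64 {
    lsn / wal_seg_size
}

/// Stand-in for wal_store.remove_up_to(): pretend to delete segments <= segno.
async fn remove_up_to(segno: u64) -> anyhow::Result<()> {
    println!("removing segments up to {segno:016X}");
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let horizon_lsn: u64 = 0x0000_0000_0300_1234; // min over peer / remote / backup LSNs
    let mut last_removed_segno: u64 = 0;

    // Everything strictly before the horizon segment is removable.
    let removal_horizon_segno = segment_number(horizon_lsn, WAL_SEG_SIZE).saturating_sub(1);

    let task: Option<JoinHandle<anyhow::Result<u64>>> = if removal_horizon_segno > last_removed_segno {
        Some(tokio::spawn(async move {
            remove_up_to(removal_horizon_segno).await?;
            Ok(removal_horizon_segno)
        }))
    } else {
        None
    };

    if let Some(task) = task {
        match task.await {
            Ok(Ok(segno)) => last_removed_segno = segno,
            Ok(Err(e)) => eprintln!("WAL removal failed: {e:?}"),
            Err(e) => eprintln!("WAL removal task panicked: {e:?}"),
        }
    }
    println!("last_removed_segno = {last_removed_segno}");
    Ok(())
}
```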
- let dir_path = TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .timeline_dir(ttid); + let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid); let dir_existed = delete_dir(dir_path)?; Ok(TimelineDeleteForceResult { @@ -401,13 +397,10 @@ impl GlobalTimelines { // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir( - TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .tenant_dir(tenant_id), - )?; + delete_dir(get_tenant_dir( + TIMELINES_STATE.lock().unwrap().get_conf(), + tenant_id, + ))?; // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 84680557f9a9..58591aecfa73 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -30,9 +30,9 @@ use tracing::*; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; -use crate::timeline::{PeerInfo, Timeline}; +use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline}; use crate::timeline_manager::StateSnapshot; -use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME}; +use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; @@ -63,13 +63,13 @@ pub fn is_wal_backup_required( /// is running, kill it. pub async fn update_task( conf: &SafeKeeperConf, - ttid: TenantTimelineId, + tli: &Arc, need_backup: bool, state: &StateSnapshot, entry: &mut Option, ) { let (offloader, election_dbg_str) = - determine_offloader(&state.peers, state.backup_lsn, ttid, conf); + determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf); let elected_me = Some(conf.my_id) == offloader; let should_task_run = need_backup && elected_me; @@ -80,15 +80,8 @@ pub async fn update_task( info!("elected for backup: {}", election_dbg_str); let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - let async_task = backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ); + let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx); let handle = if conf.current_thread_runtime { tokio::spawn(async_task) @@ -198,39 +191,32 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) { } struct WalBackupTask { - timeline: Arc, + timeline: FullAccessTimeline, timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, } /// Offload single timeline. 
-#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))] -async fn backup_task_main( - ttid: TenantTimelineId, - timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, - parallel_jobs: usize, - mut shutdown_rx: Receiver<()>, -) { +#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] +async fn backup_task_main(tli: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) { let _guard = WAL_BACKUP_TASKS.guard(); + let tli = match tli.full_access_guard().await { + Ok(tli) => tli, + Err(e) => { + error!("backup error: {}", e); + return; + } + }; info!("started"); - let res = GlobalTimelines::get(ttid); - if let Err(e) = res { - error!("backup error: {}", e); - return; - } - let tli = res.unwrap(); let mut wb = WalBackupTask { wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline_dir: tli.get_timeline_dir(), timeline: tli, - timeline_dir, - workspace_dir, parallel_jobs, }; @@ -297,7 +283,6 @@ impl WalBackupTask { commit_lsn, self.wal_seg_size, &self.timeline_dir, - &self.workspace_dir, self.parallel_jobs, ) .await @@ -319,18 +304,18 @@ impl WalBackupTask { } async fn backup_lsn_range( - timeline: &Arc, + timeline: &FullAccessTimeline, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, parallel_jobs: usize, ) -> Result<()> { if parallel_jobs < 1 { anyhow::bail!("parallel_jobs must be >= 1"); } + let remote_timeline_path = remote_timeline_path(&timeline.ttid)?; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); @@ -343,7 +328,11 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir)); + uploads.push_back(backup_single_segment( + s, + timeline_dir, + &remote_timeline_path, + )); true } None => false, @@ -381,18 +370,10 @@ async fn backup_lsn_range( async fn backup_single_segment( seg: &Segment, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, + remote_timeline_path: &RemotePath, ) -> Result { let segment_file_path = seg.file_path(timeline_dir)?; - let remote_segment_path = segment_file_path - .strip_prefix(workspace_dir) - .context("Failed to strip workspace dir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}", - ) - })?; + let remote_segment_path = seg.remote_path(remote_timeline_path); let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; if res.is_ok() { @@ -430,6 +411,10 @@ impl Segment { Ok(timeline_dir.join(self.object_name())) } + pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath { + remote_timeline_path.join(self.object_name()) + } + pub fn size(self) -> usize { (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize } @@ -530,8 +515,7 @@ pub async fn read_object( /// when called. pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { let storage = get_configured_remote_storage(); - let ttid_path = Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string()); - let remote_path = RemotePath::new(&ttid_path)?; + let remote_path = remote_timeline_path(ttid)?; // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE // const Option unwrap is not stable, otherwise it would be const. 
@@ -613,15 +597,17 @@ pub async fn copy_s3_segments( .as_ref() .unwrap(); - let relative_dst_path = - Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); - - let remote_path = RemotePath::new(&relative_dst_path)?; + let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); let files = storage - .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel) + .list( + Some(&remote_dst_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await? .keys; @@ -635,9 +621,6 @@ pub async fn copy_s3_segments( uploaded_segments ); - let relative_src_path = - Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); - for segno in from_segment..to_segment { if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { info!("copied all segments from {} until {}", from_segment, segno); @@ -649,8 +632,8 @@ pub async fn copy_s3_segments( } debug!("copying segment {}", segment_name); - let from = RemotePath::new(&relative_src_path.join(&segment_name))?; - let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + let from = remote_timeline_path(src_ttid)?.join(&segment_name); + let to = remote_dst_path.join(&segment_name); storage.copy_object(&from, &to, &cancel).await?; } @@ -661,3 +644,8 @@ pub async fn copy_s3_segments( ); Ok(()) } + +/// Get S3 (remote_storage) prefix path used for timeline files. +pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result { + RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string())) +} diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 29e944bff37b..a320be3badf5 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,8 +18,6 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. -use std::sync::Arc; - use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use rand::Rng; @@ -32,8 +30,9 @@ use utils::lsn::Lsn; use crate::{ metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, safekeeper::Term, - timeline::Timeline, - wal_backup, SafeKeeperConf, + timeline::FullAccessTimeline, + wal_backup::{self, remote_timeline_path}, + SafeKeeperConf, }; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -83,10 +82,10 @@ impl State { struct PartialBackup { wal_seg_size: usize, - tli: Arc, + tli: FullAccessTimeline, conf: SafeKeeperConf, local_prefix: Utf8PathBuf, - remote_prefix: Utf8PathBuf, + remote_timeline_path: RemotePath, state: State, } @@ -153,7 +152,7 @@ impl PartialBackup { let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); let local_path = self.local_prefix.join(self.local_segment_name(segno)); - let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?; + let remote_path = self.remote_timeline_path.join(&prepared.name); // Upload first `backup_bytes` bytes of the segment to the remote storage. 
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; @@ -253,7 +252,7 @@ impl PartialBackup { info!("deleting objects: {:?}", segments_to_delete); let mut objects_to_delete = vec![]; for seg in segments_to_delete.iter() { - let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?; + let remote_path = self.remote_timeline_path.join(seg); objects_to_delete.push(remote_path); } @@ -273,7 +272,7 @@ impl PartialBackup { } #[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] -pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { +pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -289,11 +288,11 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); let wal_seg_size = tli.get_wal_seg_size().await; - let local_prefix = tli.timeline_dir.clone(); - let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) { - Ok(path) => path.to_owned(), + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = match remote_timeline_path(&tli.ttid) { + Ok(path) => path, Err(e) => { - error!("failed to strip workspace dir prefix: {:?}", e); + error!("failed to create remote path: {:?}", e); return; } }; @@ -304,7 +303,7 @@ pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { state: persistent_state.partial_backup, conf, local_prefix, - remote_prefix, + remote_timeline_path, }; debug!("state: {:?}", backup.state); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6bc8c7c3f97b..45e27e19519c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -25,7 +25,7 @@ use utils::crashsafe::durable_rename; use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; use crate::state::TimelinePersistentState; -use crate::wal_backup::read_object; +use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; @@ -536,7 +536,7 @@ async fn remove_segments_from_disk( } pub struct WalReader { - workdir: Utf8PathBuf, + remote_path: RemotePath, timeline_dir: Utf8PathBuf, wal_seg_size: usize, pos: Lsn, @@ -558,7 +558,7 @@ pub struct WalReader { impl WalReader { pub fn new( - workdir: Utf8PathBuf, + ttid: &TenantTimelineId, timeline_dir: Utf8PathBuf, state: &TimelinePersistentState, start_pos: Lsn, @@ -586,7 +586,7 @@ impl WalReader { } Ok(Self { - workdir, + remote_path: remote_timeline_path(ttid)?, timeline_dir, wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, @@ -684,7 +684,7 @@ impl WalReader { let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name); + let wal_file_path = self.timeline_dir.join(&wal_file_name); // Try to open local file, if we may have WAL locally if self.pos >= self.local_start_lsn { @@ -712,16 +712,7 @@ impl WalReader { // Try to open remote file, if remote reads are enabled if self.enable_remote_read { - let remote_wal_file_path = wal_file_path - .strip_prefix(&self.workdir) - .context("Failed to strip workdir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {:?} for base {:?}", - wal_file_path, self.workdir, - ) - })?; 
+ let remote_wal_file_path = self.remote_path.join(&wal_file_name); return read_object(&remote_wal_file_path, xlogoff as u64).await; } diff --git a/test_runner/fixtures/common_types.py b/test_runner/fixtures/common_types.py index e9be7656694e..147264762cc4 100644 --- a/test_runner/fixtures/common_types.py +++ b/test_runner/fixtures/common_types.py @@ -72,6 +72,18 @@ def as_int(self) -> int: def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn": return Lsn(self.lsn_int - (self.lsn_int % seg_sz)) + def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int: + return self.lsn_int // seg_sz + + def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str: + segno = self.segno(seg_sz) + # The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex. + # XXXXXXXX is the higher 8 hex digits of segno + high_bits = segno >> 8 + # YY is the lower 2 hex digits of segno + low_bits = segno & 0xFF + return f"00000001{high_bits:08X}000000{low_bits:02X}" + @dataclass(frozen=True) class Key: diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b8ef63faa98a..0004745bf090 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -973,6 +973,9 @@ def __exit__( for pageserver in self.env.pageservers: pageserver.assert_no_errors() + for safekeeper in self.env.safekeepers: + safekeeper.assert_no_errors() + self.env.storage_controller.assert_no_errors() try: @@ -3813,6 +3816,9 @@ def stop(self, immediate: bool = False) -> "Safekeeper": self.running = False return self + def assert_no_errors(self): + assert not self.log_contains("manager task finished prematurely") + def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: @@ -3898,6 +3904,15 @@ def checkpoint_up_to(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Ls """ cli = self.http_client() + target_segment_file = lsn.segment_name() + + def are_segments_removed(): + segments = self.list_segments(tenant_id, timeline_id) + log.info( + f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}" + ) + assert all(target_segment_file <= s for s in segments) + def are_lsns_advanced(): stat = cli.timeline_status(tenant_id, timeline_id) log.info( @@ -3909,6 +3924,7 @@ def are_lsns_advanced(): # pageserver to this safekeeper wait_until(30, 1, are_lsns_advanced) cli.checkpoint(tenant_id, timeline_id) + wait_until(30, 1, are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" From 87afbf6b24313cbfa28809ec7ada2e72911263be Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 31 May 2024 12:00:40 -0400 Subject: [PATCH 25/31] test(pageserver): add test interface to create artificial layers (#7899) This pull request adds necessary interfaces to deterministically create scenarios we want to test. Simplify some test cases to use this interface to make it stable + reproducible. Compaction test will be able to use this interface. Also the upcoming delete tombstone tests will use this interface to make test reproducible. 
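For concreteness, here is a minimal sketch of how a test is expected to drive the new helper. It mirrors the call shape in the diff below and assumes the existing `TenantHarness` scaffolding, `TIMELINE_ID`, `DEFAULT_PG_VERSION` and the `test_img` fixture already used in `tenant.rs` tests; it is an illustration of usage, not an additional test added by this PR:

```rust
// Build a timeline that already contains one image layer at LSN 0x20 and no
// delta layers, without going through the writer/flush/compact machinery.
let harness = TenantHarness::create("example_artificial_layers")?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let tline = tenant
    .create_test_timeline_with_layers(
        TIMELINE_ID,
        Lsn(0x10),          // initdb LSN
        DEFAULT_PG_VERSION,
        &ctx,
        Vec::new(),         // delta layers
        vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
        Lsn(0x20),          // advance last_record_lsn to the end LSN
    )
    .await?;
```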
## Summary of changes * `force_create_image_layer` * `force_create_delta_layer` * `force_advance_lsn` * `create_test_timeline_with_states` * `branch_timeline_test_with_states` --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 261 ++++++++---------- pageserver/src/tenant/timeline.rs | 96 +++++++ .../src/tenant/timeline/layer_manager.rs | 7 + 3 files changed, 223 insertions(+), 141 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0a9637884fae..cfa683beb860 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1393,6 +1393,36 @@ impl Tenant { Ok(tl) } + /// Helper for unit tests to create a timeline with some pre-loaded states. + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn create_test_timeline_with_layers( + &self, + new_timeline_id: TimelineId, + initdb_lsn: Lsn, + pg_version: u32, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) + .await?; + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(initdb_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Create a new timeline. /// /// Returns the new timeline ID and reference to its Timeline object. @@ -2992,17 +3022,53 @@ impl Tenant { &self, src_timeline: &Arc, dst_id: TimelineId, - start_lsn: Option, + ancestor_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx) + .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) } + /// Helper for unit tests to branch a timeline with some pre-loaded states. + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn branch_timeline_test_with_layers( + &self, + src_timeline: &Arc, + dst_id: TimelineId, + ancestor_lsn: Option, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx) + .await?; + let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn { + ancestor_lsn + } else { + tline.get_last_record_lsn() + }; + assert!(end_lsn >= ancestor_lsn); + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Branch an existing timeline. /// /// The caller is responsible for activating the returned timeline. 
@@ -6206,75 +6272,36 @@ mod tests { async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; let (tenant, ctx) = harness.load().await; - let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) - .await?; - - let cancel = CancellationToken::new(); let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); - let mut lsn = Lsn(0x20); - - { - let mut writer = tline.writer().await; - writer - .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx) - .await?; - writer.finish_write(lsn); - drop(writer); - - tline.freeze_and_flush().await?; // this will create a image layer - } + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; let child = tenant - .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers + Lsn(0x30), + ) .await .unwrap(); - lsn.0 += 0x10; - - { - let mut writer = child.writer().await; - writer - .put( - base_key_child, - lsn, - &Value::Image(test_img("data key 2")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - child.freeze_and_flush().await?; // this will create a delta - - { - // update the partitioning to include the test key space, otherwise they - // will be dropped by image layer creation - let mut guard = child.partitioning.lock().await; - let ((partitioning, _), partition_lsn) = &mut *guard; - partitioning - .parts - .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key - *partition_lsn = lsn; - } - - child - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set - }, - &ctx, - ) - .await?; // force create an image layer for the keys, TODO: check if the image layer is created - } - async fn get_vectored_impl_wrapper( tline: &Arc, key: Key, @@ -6296,6 +6323,8 @@ mod tests { })) } + let lsn = Lsn(0x30); + // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, @@ -6333,94 +6362,42 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; let (tenant, ctx) = harness.load().await; - let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) - .await?; - let cancel = CancellationToken::new(); - - let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); - let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); - let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); - base_key.field1 = AUX_KEY_PREFIX; - 
base_key_child.field1 = AUX_KEY_PREFIX; - base_key_nonexist.field1 = AUX_KEY_PREFIX; + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... - let mut lsn = Lsn(0x20); - - { - let mut writer = tline.writer().await; - writer - .put( - base_key, - lsn, - &Value::Image(test_img("metadata key 1")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - tline.freeze_and_flush().await?; // this will create an image layer - - tline - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set.insert(CompactFlags::ForceRepartition); - set - }, - &ctx, - ) - .await?; // force create an image layer for metadata keys - tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; - } + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; let child = tenant - .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![( + Lsn(0x30), + vec![(base_key_child, test_img("metadata key 2"))], + )], // image layers + Lsn(0x30), + ) .await .unwrap(); - lsn.0 += 0x10; - - { - let mut writer = child.writer().await; - writer - .put( - base_key_child, - lsn, - &Value::Image(test_img("metadata key 2")), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); - - child.freeze_and_flush().await?; - - child - .compact( - &cancel, - { - let mut set = EnumSet::empty(); - set.insert(CompactFlags::ForceImageLayerCreation); - set.insert(CompactFlags::ForceRepartition); - set - }, - &ctx, - ) - .await?; // force create an image layer for metadata keys - tenant - .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx) - .await?; - } - async fn get_vectored_impl_wrapper( tline: &Arc, key: Key, @@ -6442,6 +6419,8 @@ mod tests { })) } + let lsn = Lsn(0x30); + // test vectored get on parent timeline assert_eq!( get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b49887646549..8033edaa12a3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5371,6 +5371,102 @@ impl Timeline { shard_count: self.tenant_shard_id.shard_count, } } + + #[cfg(test)] + pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { + self.last_record_lsn.advance(new_lsn); + } + + /// Force create an image layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. 
+ #[cfg(test)] + pub(super) async fn force_create_image_layer( + self: &Arc, + lsn: Lsn, + mut images: Vec<(Key, Bytes)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + assert!( + lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}" + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(lsn >= check_start_lsn); + } + images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); + let min_key = *images.first().map(|(k, _)| k).unwrap(); + let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(min_key..max_key), + lsn, + ctx, + ) + .await?; + for (key, img) in images { + image_layer_writer.put_image(key, img, ctx).await?; + } + let image_layer = image_layer_writer.finish(self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(image_layer); + } + + Ok(()) + } + + /// Force create a delta layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + #[cfg(test)] + pub(super) async fn force_create_delta_layer( + self: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); + let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); + let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let max_lsn = Lsn(deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap().0 + 1); + assert!( + max_lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(min_lsn >= check_start_lsn); + } + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + min_key, + min_lsn..max_lsn, + ctx, + ) + .await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(delta_layer); + } + + Ok(()) + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 884b71df7521..b78c98a506d8 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -255,6 +255,13 @@ impl LayerManager { updates.flush() } + #[cfg(test)] + pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) { + let mut updates = self.layer_map.batch_update(); + Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr); + updates.flush() + } + /// Helper function to insert a layer into the layer map and file manager. 
fn insert_historic_layer( layer: Layer, From 9fda85b4862bccf7e57c1f2fadfd03f1c3c7288b Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 17:02:10 +0100 Subject: [PATCH 26/31] pageserver: remove AncestorStopping error variants (#7916) ## Problem In all cases, AncestorStopping is equivalent to Cancelled. This became more obvious in https://github.com/neondatabase/neon/pull/7912#discussion_r1620582309 when updating these error types. ## Summary of changes - Remove AncestorStopping, always use Cancelled instead --- pageserver/src/consumption_metrics.rs | 2 +- pageserver/src/http/routes.rs | 3 --- pageserver/src/pgdatadir_mapping.rs | 7 ++----- pageserver/src/tenant/timeline.rs | 19 ++++--------------- .../fixtures/pageserver/allowed_errors.py | 2 +- 5 files changed, 8 insertions(+), 25 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 62bbde42f45e..540d0d2e8c2c 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -358,7 +358,7 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re // mean the synthetic size worker should terminate. let shutting_down = matches!( e.downcast_ref::(), - Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) + Some(PageReconstructError::Cancelled) ); if !shutting_down { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 913d45d63c6d..bd6fa028ac59 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -184,9 +184,6 @@ impl From for ApiError { PageReconstructError::Cancelled => { ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) } - PageReconstructError::AncestorStopping(_) => { - ApiError::ResourceUnavailable(format!("{pre}").into()) - } PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0fc846e5f391..c78c3588554a 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -115,9 +115,7 @@ impl From for CollectKeySpaceError { impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { - PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => { - Self::Cancelled - } + PageReconstructError::Cancelled => Self::Cancelled, _ => Self::PageRead(pre), } } @@ -1614,8 +1612,7 @@ impl<'a> DatadirModification<'a> { aux_files.dir = Some(dir); } Err( - e @ (PageReconstructError::AncestorStopping(_) - | PageReconstructError::Cancelled + e @ (PageReconstructError::Cancelled | PageReconstructError::AncestorLsnTimeout(_)), ) => { // Important that we do not interpret a shutdown error as "not found" and thereby diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8033edaa12a3..4a9d981ad8ae 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -501,10 +501,6 @@ pub(crate) enum PageReconstructError { #[error("timeline shutting down")] Cancelled, - /// The ancestor of this is being stopped - #[error("ancestor timeline {0} is being stopped")] - AncestorStopping(TimelineId), - /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), @@ -569,7 +565,7 @@ impl PageReconstructError { match self { Other(_) => false, AncestorLsnTimeout(_) => 
false, - Cancelled | AncestorStopping(_) => true, + Cancelled => true, WalRedo(_) => false, MissingKey { .. } => false, } @@ -645,9 +641,6 @@ pub(crate) enum GetVectoredError { #[derive(thiserror::Error, Debug)] pub(crate) enum GetReadyAncestorError { - #[error("ancestor timeline {0} is being stopped")] - AncestorStopping(TimelineId), - #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(#[from] WaitLsnError), @@ -757,7 +750,6 @@ impl From for PageReconstructError { fn from(e: GetReadyAncestorError) -> Self { use GetReadyAncestorError::*; match e { - AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid), AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)), Cancelled => PageReconstructError::Cancelled, @@ -1192,9 +1184,7 @@ impl Timeline { use PageReconstructError::*; match block { - Err(Cancelled | AncestorStopping(_)) => { - return Err(GetVectoredError::Cancelled) - } + Err(Cancelled) => return Err(GetVectoredError::Cancelled), Err(MissingKey(_)) if NON_INHERITED_RANGE.contains(&key) || NON_INHERITED_SPARSE_RANGE.contains(&key) => @@ -3585,9 +3575,8 @@ impl Timeline { match ancestor.wait_to_become_active(ctx).await { Ok(()) => {} Err(TimelineState::Stopping) => { - return Err(GetReadyAncestorError::AncestorStopping( - ancestor.timeline_id, - )); + // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping. + return Err(GetReadyAncestorError::Cancelled); } Err(state) => { return Err(GetReadyAncestorError::BadState { diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index ad8bbe20219a..ef412cade710 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -66,7 +66,7 @@ def scan_pageserver_log_for_errors( ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", # these can happen anytime we do compactions from background task and shutdown pageserver - r".*ERROR.*ancestor timeline \S+ is being stopped", + ".*could not compact.*cancelled.*", # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", From ef83f31e77abf7cf55387635eb3e8ad2191d97a1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 31 May 2024 21:19:41 +0300 Subject: [PATCH 27/31] pagectl: key command for dumping what we know about the key (#7890) What we know about the key via added `pagectl key $key` command: - debug formatting - shard placement when `--shard-count` is specified - different boolean queries in `key.rs` - aux files v2 Example: ``` $ cargo run -qp pagectl -- key 000000063F00004005000060270000100E2C parsed from hex: 000000063F00004005000060270000100E2C: Key { field1: 0, field2: 1599, field3: 16389, field4: 24615, field5: 0, field6: 1052204 } rel_block: true rel_vm_block: false rel_fsm_block: false slru_block: false inherited: true rel_size: false slru_segment_size: false recognized kind: None ``` --- Cargo.lock | 1 + libs/pageserver_api/src/key.rs | 36 ++- libs/pageserver_api/src/reltag.rs | 53 +++- libs/pageserver_api/src/shard.rs | 25 ++ libs/utils/src/hex.rs | 19 +- pageserver/ctl/Cargo.toml | 1 + pageserver/ctl/src/key.rs | 
477 ++++++++++++++++++++++++++++++ pageserver/ctl/src/main.rs | 4 + 8 files changed, 608 insertions(+), 8 deletions(-) create mode 100644 pageserver/ctl/src/key.rs diff --git a/Cargo.lock b/Cargo.lock index 96ba5c8ec38b..6a601044728d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3570,6 +3570,7 @@ dependencies = [ "serde", "serde_json", "svg_fmt", + "thiserror", "tokio", "tokio-util", "toml_edit", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index b00d48498ca5..e52d4ef98611 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -381,10 +381,15 @@ pub fn rel_size_to_key(rel: RelTag) -> Key { field3: rel.dbnode, field4: rel.relnode, field5: rel.forknum, - field6: 0xffffffff, + field6: 0xffff_ffff, } } +#[inline(always)] +pub fn is_rel_size_key(key: &Key) -> bool { + key.field1 == 0 && key.field6 == u32::MAX +} + #[inline(always)] pub fn rel_key_range(rel: RelTag) -> Range { Key { @@ -422,6 +427,25 @@ pub fn slru_dir_to_key(kind: SlruKind) -> Key { } } +#[inline(always)] +pub fn slru_dir_kind(key: &Key) -> Option> { + if key.field1 == 0x01 + && key.field3 == 0 + && key.field4 == 0 + && key.field5 == 0 + && key.field6 == 0 + { + match key.field2 { + 0 => Some(Ok(SlruKind::Clog)), + 1 => Some(Ok(SlruKind::MultiXactMembers)), + 2 => Some(Ok(SlruKind::MultiXactOffsets)), + x => Some(Err(x)), + } + } else { + None + } +} + #[inline(always)] pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { Key { @@ -450,10 +474,18 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { field3: 1, field4: segno, field5: 0, - field6: 0xffffffff, + field6: 0xffff_ffff, } } +pub fn is_slru_segment_size_key(key: &Key) -> bool { + key.field1 == 0x01 + && key.field2 < 0x03 + && key.field3 == 0x01 + && key.field5 == 0 + && key.field6 == u32::MAX +} + #[inline(always)] pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { let field2 = match kind { diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 38693ab84787..010a9c2932d6 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use std::fmt; use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::forknumber_to_name; +use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; /// @@ -68,6 +68,57 @@ impl fmt::Display for RelTag { } } +#[derive(Debug, thiserror::Error)] +pub enum ParseRelTagError { + #[error("invalid forknum")] + InvalidForknum(#[source] std::num::ParseIntError), + #[error("missing triplet member {}", .0)] + MissingTripletMember(usize), + #[error("invalid triplet member {}", .0)] + InvalidTripletMember(usize, #[source] std::num::ParseIntError), +} + +impl std::str::FromStr for RelTag { + type Err = ParseRelTagError; + + fn from_str(s: &str) -> Result { + use ParseRelTagError::*; + + // FIXME: in postgres logs this separator is dot + // Example: + // could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0 + // with a regex we could get this more painlessly + let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) { + Some((t, f)) => { + let forknum = forkname_to_number(Some(f)); + let forknum = if let Ok(f) = forknum { + f + } else { + f.parse::().map_err(InvalidForknum)? 
+ }; + + (t, Some(forknum)) + } + None => (s, None), + }; + + let mut split = triplet + .splitn(3, '/') + .enumerate() + .map(|(i, s)| s.parse::().map_err(|e| InvalidTripletMember(i, e))); + let spcnode = split.next().ok_or(MissingTripletMember(0))??; + let dbnode = split.next().ok_or(MissingTripletMember(1))??; + let relnode = split.next().ok_or(MissingTripletMember(2))??; + + Ok(RelTag { + spcnode, + forknum: forknum.unwrap_or(MAIN_FORKNUM), + dbnode, + relnode, + }) + } +} + impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { let mut name = if self.spcnode == GLOBALTABLESPACE_OID { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 1c05a01926f0..8ace426f8801 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -428,6 +428,12 @@ impl<'de> Deserialize<'de> for TenantShardId { #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); +impl Default for ShardStripeSize { + fn default() -> Self { + DEFAULT_STRIPE_SIZE + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardLayout(u8); @@ -713,6 +719,25 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke ShardNumber((hash % count.0 as u32) as u8) } +/// For debugging, while not exposing the internals. +#[derive(Debug)] +#[allow(unused)] // used by debug formatting by pagectl +struct KeyShardingInfo { + shard0: bool, + shard_number: ShardNumber, +} + +pub fn describe( + key: &Key, + shard_count: ShardCount, + stripe_size: ShardStripeSize, +) -> impl std::fmt::Debug { + KeyShardingInfo { + shard0: key_is_shard0(key), + shard_number: key_to_shard_number(shard_count, stripe_size, key), + } +} + #[cfg(test)] mod tests { use utils::Hex; diff --git a/libs/utils/src/hex.rs b/libs/utils/src/hex.rs index fc0bb7e4a280..382f805a967d 100644 --- a/libs/utils/src/hex.rs +++ b/libs/utils/src/hex.rs @@ -19,13 +19,13 @@ /// // right: [0x68; 1] /// # fn serialize_something() -> Vec { "hello world".as_bytes().to_vec() } /// ``` -#[derive(PartialEq)] -pub struct Hex<'a>(pub &'a [u8]); +pub struct Hex(pub S); -impl std::fmt::Debug for Hex<'_> { +impl> std::fmt::Debug for Hex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[")?; - for (i, c) in self.0.chunks(16).enumerate() { + let chunks = self.0.as_ref().chunks(16); + for (i, c) in chunks.enumerate() { if i > 0 && !c.is_empty() { writeln!(f, ", ")?; } @@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> { write!(f, "0x{b:02x}")?; } } - write!(f, "; {}]", self.0.len()) + write!(f, "; {}]", self.0.as_ref().len()) + } +} + +impl, L: AsRef<[u8]>> PartialEq> for Hex { + fn eq(&self, other: &Hex) -> bool { + let left = self.0.as_ref(); + let right = other.0.as_ref(); + + left == right } } diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 843f5dd862ab..be5626040b96 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -17,6 +17,7 @@ pageserver = { path = ".." 
} pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true +thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true toml_edit.workspace = true diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs new file mode 100644 index 000000000000..28448811f8b1 --- /dev/null +++ b/pageserver/ctl/src/key.rs @@ -0,0 +1,477 @@ +use anyhow::Context; +use clap::Parser; +use pageserver_api::{ + key::Key, + reltag::{BlockNumber, RelTag, SlruKind}, + shard::{ShardCount, ShardStripeSize}, +}; +use std::str::FromStr; + +#[derive(Parser)] +pub(super) struct DescribeKeyCommand { + /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum + input: Vec, + + /// The number of shards to calculate what Keys placement would be. + #[arg(long)] + shard_count: Option, + + /// The sharding stripe size. + /// + /// The default is hardcoded. It makes no sense to provide this without providing + /// `--shard-count`. + #[arg(long, requires = "shard_count")] + stripe_size: Option, +} + +/// Sharded shard count without unsharded count, which the actual ShardCount supports. +#[derive(Clone, Copy)] +pub(super) struct CustomShardCount(std::num::NonZeroU8); + +#[derive(Debug, thiserror::Error)] +pub(super) enum InvalidShardCount { + #[error(transparent)] + ParsingFailed(#[from] std::num::ParseIntError), + #[error("too few shards")] + TooFewShards, +} + +impl FromStr for CustomShardCount { + type Err = InvalidShardCount; + + fn from_str(s: &str) -> Result { + let inner: std::num::NonZeroU8 = s.parse()?; + if inner.get() < 2 { + Err(InvalidShardCount::TooFewShards) + } else { + Ok(CustomShardCount(inner)) + } + } +} + +impl From for ShardCount { + fn from(value: CustomShardCount) -> Self { + ShardCount::new(value.0.get()) + } +} + +impl DescribeKeyCommand { + pub(super) fn execute(self) { + let DescribeKeyCommand { + input, + shard_count, + stripe_size, + } = self; + + let material = KeyMaterial::try_from(input.as_slice()).unwrap(); + let kind = material.kind(); + let key = Key::from(material); + + println!("parsed from {kind}: {key}:"); + println!(); + println!("{key:?}"); + + macro_rules! kind_query { + ($name:ident) => {{ + let s: &'static str = stringify!($name); + let s = s.strip_prefix("is_").unwrap_or(s); + let s = s.strip_suffix("_key").unwrap_or(s); + + #[allow(clippy::needless_borrow)] + (s, pageserver_api::key::$name(key)) + }}; + } + + // the current characterization is a mess of these boolean queries and separate + // "recognization". I think it accurately represents how strictly we model the Key + // right now, but could of course be made less confusing. 
+ + let queries = [ + ("rel_block", pageserver_api::key::is_rel_block_key(&key)), + kind_query!(is_rel_vm_block_key), + kind_query!(is_rel_fsm_block_key), + kind_query!(is_slru_block_key), + kind_query!(is_inherited_key), + ("rel_size", pageserver_api::key::is_rel_size_key(&key)), + ( + "slru_segment_size", + pageserver_api::key::is_slru_segment_size_key(&key), + ), + ]; + + let recognized_kind = "recognized kind"; + let metadata_key = "metadata key"; + let shard_placement = "shard placement"; + + let longest = queries + .iter() + .map(|t| t.0) + .chain([recognized_kind, metadata_key, shard_placement]) + .map(|s| s.len()) + .max() + .unwrap(); + + let colon = 1; + let padding = 1; + + for (name, is) in queries { + let width = longest - name.len() + colon + padding; + println!("{}{:width$}{}", name, ":", is); + } + + let width = longest - recognized_kind.len() + colon + padding; + println!( + "{}{:width$}{:?}", + recognized_kind, + ":", + RecognizedKeyKind::new(key), + ); + + if let Some(shard_count) = shard_count { + // seeing the sharding placement might be confusing, so leave it out unless shard + // count was given. + + let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default(); + println!( + "# placement with shard_count: {} and stripe_size: {}:", + shard_count.0, stripe_size.0 + ); + let width = longest - shard_placement.len() + colon + padding; + println!( + "{}{:width$}{:?}", + shard_placement, + ":", + pageserver_api::shard::describe(&key, shard_count.into(), stripe_size) + ); + } + } +} + +/// Hand-wavy "inputs we accept" for a key. +#[derive(Debug)] +pub(super) enum KeyMaterial { + Hex(Key), + String(SpanAttributesFromLogs), + Split(RelTag, BlockNumber), +} + +impl KeyMaterial { + fn kind(&self) -> &'static str { + match self { + KeyMaterial::Hex(_) => "hex", + KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split", + } + } +} + +impl From for Key { + fn from(value: KeyMaterial) -> Self { + match value { + KeyMaterial::Hex(key) => key, + KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum)) + | KeyMaterial::Split(rt, blocknum) => { + pageserver_api::key::rel_block_to_key(rt, blocknum) + } + } + } +} + +impl> TryFrom<&[S]> for KeyMaterial { + type Error = anyhow::Error; + + fn try_from(value: &[S]) -> Result { + match value { + [] => anyhow::bail!( + "need 1..N positional arguments describing the key, try hex or a log line" + ), + [one] => { + let one = one.as_ref(); + + let key = Key::from_hex(one).map(KeyMaterial::Hex); + + let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String); + + match (key, attrs) { + (Ok(key), _) => Ok(key), + (_, Ok(s)) => Ok(s), + (Err(e1), Err(e2)) => anyhow::bail!( + "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}" + ), + } + } + more => { + // assume going left to right one of these is a reltag and then we find a blocknum + // this works, because we don't have plain numbers at least right after reltag in + // logs. for some definition of "works". 
+ + let Some((reltag_at, reltag)) = more + .iter() + .map(AsRef::as_ref) + .enumerate() + .find_map(|(i, s)| { + s.split_once("rel=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + .map(|rt| (i, rt)) + }) + else { + anyhow::bail!("found no RelTag in arguments"); + }; + + let Some(blocknum) = more + .iter() + .map(AsRef::as_ref) + .skip(reltag_at) + .find_map(|s| { + s.split_once("blkno=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + }) + else { + anyhow::bail!("found no blocknum in arguments"); + }; + + Ok(KeyMaterial::Split(reltag, blocknum)) + } + } + } +} + +#[derive(Debug)] +pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber); + +impl std::str::FromStr for SpanAttributesFromLogs { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + // accept the span separator but do not require or fail if either is missing + // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}" + let (_, reltag) = s + .split_once("rel=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?; + let reltag = reltag.split_whitespace().next().unwrap(); + + let (_, blocknum) = s + .split_once("blkno=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?; + let blocknum = blocknum.split_whitespace().next().unwrap(); + + let reltag = reltag + .parse() + .with_context(|| format!("parse reltag from {reltag:?}"))?; + let blocknum = blocknum + .parse() + .with_context(|| format!("parse blocknum from {blocknum:?}"))?; + + Ok(Self(reltag, blocknum)) + } +} + +#[derive(Debug)] +#[allow(dead_code)] // debug print is used +enum RecognizedKeyKind { + DbDir, + ControlFile, + Checkpoint, + AuxFilesV1, + SlruDir(Result), + RelMap(RelTagish<2>), + RelDir(RelTagish<2>), + AuxFileV2(Result>), +} + +#[derive(Debug, PartialEq)] +#[allow(unused)] +enum AuxFileV2 { + Recognized(&'static str, utils::Hex<[u8; 13]>), + OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>), + Other(utils::Hex<[u8; 13]>), +} + +impl RecognizedKeyKind { + fn new(key: Key) -> Option { + use RecognizedKeyKind::{ + AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir, + }; + + let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key); + + Some(match key { + pageserver_api::key::DBDIR_KEY => DbDir, + pageserver_api::key::CONTROLFILE_KEY => ControlFile, + pageserver_api::key::CHECKPOINT_KEY => Checkpoint, + pageserver_api::key::AUX_FILES_KEY => AuxFilesV1, + _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()), + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => { + RelMap([key.field2, key.field3].into()) + } + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => { + RelDir([key.field2, key.field3].into()) + } + _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2( + AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())), + ), + _ => return None, + }) + } +} + +impl AuxFileV2 { + fn new(key: Key) -> Option { + const EMPTY_HASH: [u8; 13] = { + let mut out = [0u8; 13]; + let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes(); + let mut i = 3; + while i < 16 { + out[i - 3] = hash[i]; + i += 1; + } + out + }; + + let bytes = key.to_i128().to_be_bytes(); + let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap()); + + assert_eq!(EMPTY_HASH.len(), hash.0.len()); + + // TODO: we could probably find the preimages for the hashes + + Some(match (bytes[1], bytes[2]) { + (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash), + (1, 2) => 
AuxFileV2::Recognized("pg_logical/snapshots/", hash), + (1, 3) if hash.0 == EMPTY_HASH => { + AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) + } + (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), + (0xff, 0xff) => AuxFileV2::Other(hash), + _ => return None, + }) + } +} + +/// Prefix of RelTag, currently only known use cases are the two item versions. +/// +/// Renders like a reltag with `/`, nothing else. +struct RelTagish([u32; N]); + +impl From<[u32; N]> for RelTagish { + fn from(val: [u32; N]) -> Self { + RelTagish(val) + } +} + +impl std::fmt::Debug for RelTagish { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use std::fmt::Write as _; + let mut first = true; + self.0.iter().try_for_each(|x| { + if !first { + f.write_char('/')?; + } + first = false; + write!(f, "{}", x) + }) + } +} + +#[cfg(test)] +mod tests { + use pageserver::aux_file::encode_aux_file_key; + + use super::*; + + #[test] + fn hex_is_key_material() { + let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap(); + assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}"); + } + + #[test] + fn single_positional_spanalike_is_key_material() { + // why is this needed? if you are checking many, then copypaste starts to appeal + let strings = [ + (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + (line!(), "rel=1663/208101/2620_fsm blkno=2"), + (line!(), "rel=1663/208101/2620.1 blkno=2"), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(&[example][..]) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + + // not supporting this is rather accidential, but I think the input parsing is lenient + // enough already + KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err(); + } + + #[test] + fn multiple_spanlike_args() { + let strings = [ + (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), + (line!(), &["1663/208101/2620_fsm", "2"][..]), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(example) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + } + #[test] + fn recognized_auxfiles() { + use AuxFileV2::*; + + let empty = [ + 0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d, + ]; + let foobar = [ + 0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18, + ]; + + #[rustfmt::skip] + let examples = [ + (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))), + (line!(), "pg_logical/snapshots/foobar", 
Recognized("pg_logical/snapshots/", utils::Hex(foobar))), + (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))), + (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))), + (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))), + (line!(), "foobar", Other(utils::Hex(foobar))), + ]; + + for (line, path, expected) in examples { + let key = encode_aux_file_key(path); + let recognized = + AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed")); + + assert_eq!(recognized, expected); + } + + assert_eq!( + AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()), + None, + "example key has one too few 0 after 6 before 1" + ); + } +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index e92c352dab1d..50c3ac4c6143 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -6,6 +6,7 @@ mod draw_timeline_dir; mod index_part; +mod key; mod layer_map_analyzer; mod layers; @@ -61,6 +62,8 @@ enum Commands { AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] Layer(LayerCmd), + /// Debug print a hex key found from logs + Key(key::DescribeKeyCommand), } /// Read and update pageserver metadata file @@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> { .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) .await?; } + Commands::Key(dkc) => dkc.execute(), }; Ok(()) } From 7e60563910936cf6643edb686a8163b0b03c7108 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 31 May 2024 22:20:06 +0100 Subject: [PATCH 28/31] pageserver: add GcError type (#7917) ## Problem - Because GC exposes all errors as an anyhow::Error, we have intermittent issues with spurious log errors during shutdown, e.g. in this failure of a performance test https://neon-github-public-dev.s3.amazonaws.com/reports/main/9300804302/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/214a2154f6f0217a/ ``` Gc failed 1 times, retrying in 2s: shutting down ``` GC really doesn't do a lot of complicated IO: it doesn't benefit from the backtrace capabilities of anyhow::Error, and can be expressed more robustly as an enum. ## Summary of changes - Add GcError type and use it instead of anyhow::Error in GC functions - In `gc_iteration_internal`, return GcError::Cancelled on shutdown rather than Ok(()) (we only used Ok before because we didn't have a clear cancellation error variant to use). - In `gc_iteration_internal`, skip past timelines that are shutting down, to avoid having to go through another GC iteration if we happen to see a deleting timeline during a GC run. - In `refresh_gc_info_internal`, avoid an error case where a timeline might not be found after being looked up, by carrying an Arc instead of a TimelineId between the first loop and second loop in the function. - In HTTP request handler, handle Cancelled variants as 503 instead of turning all GC errors into 500s. 
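To make the last bullet concrete: with a typed error, the handler can separate shutdown from real failures with a plain `match`. The sketch below is illustrative only; the variant names follow the `GcError` enum added in this patch and the `ApiError` constructors already used for `PageReconstructError`, but the actual `http/routes.rs` hunk is not reproduced in this excerpt:

```rust
// Hypothetical mapping sketch, not the literal handler change.
impl From<GcError> for ApiError {
    fn from(e: GcError) -> Self {
        match e {
            // Shutdown-related variants surface as 503 (resource unavailable) ...
            GcError::TenantCancelled | GcError::TimelineCancelled | GcError::NotActive => {
                ApiError::ResourceUnavailable("gc cancelled or tenant not active".into())
            }
            // ... everything else is still an internal server error (500).
            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
        }
    }
}
```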
---
 pageserver/src/tenant.rs | 112 +++++++++++++---------
 pageserver/src/tenant/mgr.rs | 10 +-
 pageserver/src/tenant/tasks.rs | 33 ++++---
 pageserver/src/tenant/timeline.rs | 47 ++++++---
 test_runner/regress/test_tenant_detach.py | 4 +-
 5 files changed, 129 insertions(+), 77 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index cfa683beb860..eff9c742c1fd 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -487,6 +487,33 @@ enum CreateTimelineCause {
     Delete,
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum GcError {
+    // The tenant is shutting down
+    #[error("tenant shutting down")]
+    TenantCancelled,
+
+    // The timeline is shutting down
+    #[error("timeline shutting down")]
+    TimelineCancelled,
+
+    // The tenant is in a state ineligible to run GC
+    #[error("not active")]
+    NotActive,
+
+    // A requested GC cutoff LSN was invalid, for example it tried to move backwards
+    #[error("bad lsn: {why}")]
+    BadLsn { why: String },
+
+    // A remote storage error while scheduling updates after GC
+    #[error(transparent)]
+    Remote(anyhow::Error),
+
+    // If GC was invoked for a particular timeline, this error means it didn't exist
+    #[error("timeline not found")]
+    TimelineNotFound,
+}
+
 impl Tenant {
     /// Yet another helper for timeline initialization.
     ///
@@ -1605,24 +1632,23 @@ impl Tenant {
     /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
     /// requires more history to be retained.
     //
-    pub async fn gc_iteration(
+    pub(crate) async fn gc_iteration(
         &self,
         target_timeline_id: Option<TimelineId>,
         horizon: u64,
         pitr: Duration,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<GcResult> {
+    ) -> Result<GcResult, GcError> {
         // Don't start doing work during shutdown
         if let TenantState::Stopping { .. } = self.current_state() {
             return Ok(GcResult::default());
         }

         // there is a global allowed_error for this
-        anyhow::ensure!(
-            self.is_active(),
-            "Cannot run GC iteration on inactive tenant"
-        );
+        if !self.is_active() {
+            return Err(GcError::NotActive);
+        }

         {
             let conf = self.tenant_conf.load();
@@ -2790,28 +2816,13 @@ impl Tenant {
         pitr: Duration,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> anyhow::Result<GcResult> {
+    ) -> Result<GcResult, GcError> {
         let mut totals: GcResult = Default::default();
         let now = Instant::now();

-        let gc_timelines = match self
+        let gc_timelines = self
             .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
-            .await
-        {
-            Ok(result) => result,
-            Err(e) => {
-                if let Some(PageReconstructError::Cancelled) =
-                    e.downcast_ref::<PageReconstructError>()
-                {
-                    // Handle cancellation
-                    totals.elapsed = now.elapsed();
-                    return Ok(totals);
-                } else {
-                    // Propagate other errors
-                    return Err(e);
-                }
-            }
-        };
+            .await?;

         failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");

@@ -2836,7 +2847,19 @@ impl Tenant {
                 // made.
                 break;
             }
-            let result = timeline.gc().await?;
+            let result = match timeline.gc().await {
+                Err(GcError::TimelineCancelled) => {
+                    if target_timeline_id.is_some() {
+                        // If we were targeting this specific timeline, surface cancellation to caller
+                        return Err(GcError::TimelineCancelled);
+                    } else {
+                        // A timeline may be shutting down independently of the tenant's lifecycle: we should
+                        // skip past this and proceed to try GC on other timelines.
+                        continue;
+                    }
+                }
+                r => r?,
+            };

             totals += result;
         }

@@ -2849,11 +2872,11 @@ impl Tenant {
     /// [`Tenant::get_gc_horizon`].
     ///
     /// This is usually executed as part of periodic gc, but can now be triggered more often.
- pub async fn refresh_gc_info( + pub(crate) async fn refresh_gc_info( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> Result>, GcError> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -2874,7 +2897,7 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> Result>, GcError> { // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for // currently visible timelines. let timelines = self @@ -2911,8 +2934,8 @@ impl Tenant { } } - if !self.is_active() { - anyhow::bail!("shutting down"); + if !self.is_active() || self.cancel.is_cancelled() { + return Err(GcError::TenantCancelled); } // grab mutex to prevent new timelines from being created here; avoid doing long operations @@ -2921,19 +2944,19 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { + let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { let timelines = self.timelines.lock().unwrap(); let mut all_branchpoints = BTreeSet::new(); - let timeline_ids = { + let timelines = { if let Some(target_timeline_id) = target_timeline_id.as_ref() { if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") + return Err(GcError::TimelineNotFound); } }; timelines .iter() - .map(|(timeline_id, timeline_entry)| { + .map(|(_timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { @@ -2955,33 +2978,28 @@ impl Tenant { } } - *timeline_id + timeline_entry.clone() }) .collect::>() }; - (all_branchpoints, timeline_ids) + (all_branchpoints, timelines) }; // Ok, we now know all the branch points. // Update the GC information for each timeline. - let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); - for timeline_id in timeline_ids { - // Timeline is known to be local and loaded. 
- let timeline = self - .get_timeline(timeline_id, false) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; - + let mut gc_timelines = Vec::with_capacity(timelines.len()); + for timeline in timelines { // If target_timeline is specified, ignore all other timelines if let Some(target_timeline_id) = target_timeline_id { - if timeline_id != target_timeline_id { + if timeline.timeline_id != target_timeline_id { continue; } } let branchpoints: Vec = all_branchpoints .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), + Included((timeline.timeline_id, Lsn(0))), + Included((timeline.timeline_id, Lsn(u64::MAX))), )) .map(|&x| x.1) .collect(); @@ -2989,7 +3007,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); - match gc_cutoffs.remove(&timeline_id) { + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { *target = GcInfo { retain_lsns: branchpoints, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 89fdf31849eb..0bb1d750aa2d 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -2833,7 +2833,13 @@ pub(crate) async fn immediate_gc( } } - result.map_err(ApiError::InternalServerError) + result.map_err(|e| match e { + GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, + GcError::TimelineNotFound => { + ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) + } + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + }) } #[cfg(test)] diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index bf2d8a47b4ca..a6dfa84f3598 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -380,21 +380,28 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let res = tenant .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) .await; - if let Err(e) = res { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - error!( + match res { + Ok(_) => { + error_run_count = 0; + period + } + Err(crate::tenant::GcError::TenantCancelled) => { + return; + } + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + + error!( "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", ); - wait_duration - } else { - error_run_count = 0; - period + wait_duration + } } }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a9d981ad8ae..9bf429972d89 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -131,11 +131,14 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use 
super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { @@ -4837,7 +4840,7 @@ impl Timeline { /// Currently, we don't make any attempt at removing unneeded page versions /// within a layer file. We can only remove the whole file if it's fully /// obsolete. - pub(super) async fn gc(&self) -> anyhow::Result { + pub(super) async fn gc(&self) -> Result { // this is most likely the background tasks, but it might be the spawned task from // immediate_gc let _g = tokio::select! { @@ -4850,7 +4853,7 @@ impl Timeline { // Is the timeline being deleted? if self.is_stopping() { - anyhow::bail!("timeline is Stopping"); + return Err(GcError::TimelineCancelled); } let (horizon_cutoff, pitr_cutoff, retain_lsns) = { @@ -4908,7 +4911,7 @@ impl Timeline { pitr_cutoff: Lsn, retain_lsns: Vec, new_gc_cutoff: Lsn, - ) -> anyhow::Result { + ) -> Result { // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc let now = SystemTime::now(); @@ -4930,12 +4933,15 @@ impl Timeline { // The GC cutoff should only ever move forwards. let waitlist = { let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); - ensure!( - *write_guard <= new_gc_cutoff, - "Cannot move GC cutoff LSN backwards (was {}, new {})", - *write_guard, - new_gc_cutoff - ); + if *write_guard > new_gc_cutoff { + return Err(GcError::BadLsn { + why: format!( + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, new_gc_cutoff + ), + }); + } + write_guard.store_and_unlock(new_gc_cutoff) }; waitlist.wait().await; @@ -5044,7 +5050,14 @@ impl Timeline { // This unconditionally schedules also an index_part.json update, even though, we will // be doing one a bit later with the unlinked gc'd layers. 
        let disk_consistent_lsn = self.disk_consistent_lsn.load();
-        self.schedule_uploads(disk_consistent_lsn, None)?;
+        self.schedule_uploads(disk_consistent_lsn, None)
+            .map_err(|e| {
+                if self.cancel.is_cancelled() {
+                    GcError::TimelineCancelled
+                } else {
+                    GcError::Remote(e)
+                }
+            })?;

        let gc_layers = layers_to_remove
            .iter()
@@ -5053,7 +5066,15 @@ impl Timeline {

        result.layers_removed = gc_layers.len() as u64;

-        self.remote_client.schedule_gc_update(&gc_layers)?;
+        self.remote_client
+            .schedule_gc_update(&gc_layers)
+            .map_err(|e| {
+                if self.cancel.is_cancelled() {
+                    GcError::TimelineCancelled
+                } else {
+                    GcError::Remote(e)
+                }
+            })?;

        guard.finish_gc_timeline(&gc_layers);

@@ -5068,7 +5089,7 @@ impl Timeline {
            result.layers_removed, new_gc_cutoff
        );

-        result.elapsed = now.elapsed()?;
+        result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
        Ok(result)
    }

diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 12a4730e691c..871351b2d54b 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -302,7 +302,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):

    # gc should not try to even start on a timeline that doesn't exist
    with pytest.raises(
-        expected_exception=PageserverApiException, match="gc target timeline does not exist"
+        expected_exception=PageserverApiException, match="NotFound: Timeline not found"
    ):
        bogus_timeline_id = TimelineId.generate()
        pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
@@ -310,7 +310,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env.pageserver.allowed_errors.extend(
        [
            # the error will be printed to the log too
-            ".*gc target timeline does not exist.*",
+            ".*NotFound: Timeline not found.*",
            # Timelines get stopped during detach, ignore the gc calls that error, witnessing that
            ".*InternalServerError\\(timeline is Stopping.*",
        ]

From e98bc4fd2ba3cb4a3fa6d00f98406fb0fdb916a8 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky
Date: Sat, 1 Jun 2024 00:18:56 +0100
Subject: [PATCH 29/31] Run gc on too many partial backup segments (#7700)

The general partial backup idea is that each safekeeper keeps only one partial segment in remote storage at a time. Sometimes this is not true, for example if we uploaded an object to S3 but then got an error when trying to remove the previous upload. In that case we still keep a list of all potentially uploaded objects in the safekeeper state.

This commit prints a warning to the logs if there are too many objects in the safekeeper state. This is not expected, and we should try to fix such a state; we can do that by running gc.

I haven't seen this be an issue anywhere, but printing a warning is something I wanted to do and forgot in the initial PR.
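For context, a minimal standalone sketch of the situation the warning targets (illustrative only — `PartialState`, `rotate`, and the gc policy below are simplified stand-ins, not the safekeeper's actual types): the new partial object is uploaded before the previous one is removed, so a failed removal leaves an extra name in the state, and once enough of them pile up the warn-then-gc threshold kicks in.

```
/// Illustrative only: names of partial-segment objects believed to exist in remote storage.
struct PartialState {
    segments: Vec<String>,
}

const MAX_SIMULTANEOUS_SEGMENTS: usize = 10;

impl PartialState {
    /// Upload-then-delete rotation: when the delete of the previous object fails,
    /// its name stays in `segments` so a later gc pass can retry the removal.
    fn rotate(&mut self, new_name: String, delete_ok: bool) {
        self.segments.push(new_name); // the new object is uploaded first
        if delete_ok && self.segments.len() > 1 {
            self.segments.remove(0); // previous upload removed successfully
        }
    }

    /// Mirrors the warn-then-gc behavior in this sketch: keep only the most recent upload.
    fn maybe_warn_and_gc(&mut self) {
        if self.segments.len() > MAX_SIMULTANEOUS_SEGMENTS {
            eprintln!(
                "too many segments in control_file state, running gc: {}",
                self.segments.len()
            );
            let newest = self.segments.pop();
            self.segments.clear();
            self.segments.extend(newest);
        }
    }
}

fn main() {
    let mut state = PartialState { segments: Vec::new() };
    for i in 0..12 {
        state.rotate(format!("{i}.partial"), false); // every delete of the old object fails
    }
    state.maybe_warn_and_gc();
    assert_eq!(state.segments.len(), 1);
}
```

As in the patch, cleanup stays best-effort: a failed gc is only logged and the upload loop keeps going.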
--- safekeeper/src/wal_backup_partial.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index a320be3badf5..6c0f35095b16 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -24,7 +24,7 @@ use rand::Rng; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; -use tracing::{debug, error, info, instrument}; +use tracing::{debug, error, info, instrument, warn}; use utils::lsn::Lsn; use crate::{ @@ -308,7 +308,23 @@ pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { debug!("state: {:?}", backup.state); + // The general idea is that each safekeeper keeps only one partial segment + // both in remote storage and in local state. If this is not true, something + // went wrong. + const MAX_SIMULTANEOUS_SEGMENTS: usize = 10; + 'outer: loop { + if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS { + warn!( + "too many segments in control_file state, running gc: {}", + backup.state.segments.len() + ); + + backup.gc().await.unwrap_or_else(|e| { + error!("failed to run gc: {:#}", e); + }); + } + // wait until we have something to upload let uploaded_segment = backup.state.uploaded_segment(); if let Some(seg) = &uploaded_segment { From a345cf3fc695282823a7cc2a8711213adb64ec3c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Sat, 1 Jun 2024 12:23:59 +0100 Subject: [PATCH 30/31] Fix span for WAL removal task (#7930) During refactoring in https://github.com/neondatabase/neon/pull/7887 I forgot to add "WAL removal" span with ttid. This commit fixes it. --- safekeeper/src/timeline_manager.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 84862207d5bf..7174d843fcde 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -10,7 +10,7 @@ use std::{ use postgres_ffi::XLogSegNo; use tokio::task::{JoinError, JoinHandle}; -use tracing::{info, instrument, warn}; +use tracing::{info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; use crate::{ @@ -346,10 +346,13 @@ async fn update_wal_removal( &tli.read_shared_state().await.sk.wal_store, removal_horizon_segno, ); - *wal_removal_task = Some(tokio::spawn(async move { - remover.await?; - Ok(removal_horizon_segno) - })); + *wal_removal_task = Some(tokio::spawn( + async move { + remover.await?; + Ok(removal_horizon_segno) + } + .instrument(info_span!("WAL removal", ttid=%tli.ttid)), + )); } } From db477c0b8c59081207c1ba7fc6e599b741a0b717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sun, 2 Jun 2024 16:10:56 +0200 Subject: [PATCH 31/31] Add metrics for Azure blob storage (#7933) In issue #5590 it was proposed to implement metrics for Azure blob storage. This PR implements them except for the part that performs the rename, which is left for a followup. 
Closes #5590 --- libs/remote_storage/src/azure_blob.rs | 85 ++++++++++++++----- libs/remote_storage/src/lib.rs | 1 + .../src/{s3_bucket => }/metrics.rs | 52 +++++++++--- libs/remote_storage/src/s3_bucket.rs | 65 +++++--------- 4 files changed, 125 insertions(+), 78 deletions(-) rename libs/remote_storage/src/{s3_bucket => }/metrics.rs (76%) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 24c1248304e1..aca22c6b3eef 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -26,13 +26,14 @@ use futures::stream::Stream; use futures_util::StreamExt; use futures_util::TryStreamExt; use http_types::{StatusCode, Url}; +use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use tracing::debug; +use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ - error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download, - DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, - TimeTravelError, TimeoutOrCancel, + error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, + ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { @@ -137,6 +138,8 @@ impl AzureBlobStorage { let mut last_modified = None; let mut metadata = HashMap::new(); + let started_at = start_measuring_requests(kind); + let download = async { let response = builder // convert to concrete Pageable @@ -200,13 +203,22 @@ impl AzureBlobStorage { }) }; - tokio::select! { + let download = tokio::select! { bufs = download => bufs, cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { - TimeoutOrCancel::Timeout => Err(DownloadError::Timeout), - TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled), + TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled), }, - } + }; + let started_at = ScopeGuard::into_inner(started_at); + let outcome = match &download { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + download } async fn permit( @@ -340,7 +352,10 @@ impl RemoteStorage for AzureBlobStorage { metadata: Option, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Put, cancel).await?; + let kind = RequestKind::Put; + let _permit = self.permit(kind, cancel).await?; + + let started_at = start_measuring_requests(kind); let op = async { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); @@ -364,14 +379,25 @@ impl RemoteStorage for AzureBlobStorage { match fut.await { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure.into()), - Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } }; - tokio::select! { + let res = tokio::select! 
{ res = op => res, - _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), - } + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let outcome = match res { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + + res } async fn download( @@ -417,12 +443,13 @@ impl RemoteStorage for AzureBlobStorage { paths: &'a [RemotePath], cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Delete, cancel).await?; + let kind = RequestKind::Delete; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); let op = async { - // TODO batch requests are also not supported by the SDK + // TODO batch requests are not supported by the SDK // https://github.com/Azure/azure-sdk-for-rust/issues/1068 - // https://github.com/Azure/azure-sdk-for-rust/issues/1249 for path in paths { let blob_client = self.client.blob_client(self.relative_path_to_name(path)); @@ -447,10 +474,16 @@ impl RemoteStorage for AzureBlobStorage { Ok(()) }; - tokio::select! { + let res = tokio::select! { res = op => res, - _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()), - } + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } async fn copy( @@ -459,7 +492,9 @@ impl RemoteStorage for AzureBlobStorage { to: &RemotePath, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Copy, cancel).await?; + let kind = RequestKind::Copy; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); let timeout = tokio::time::sleep(self.timeout); @@ -503,15 +538,21 @@ impl RemoteStorage for AzureBlobStorage { } }; - tokio::select! { + let res = tokio::select! 
{ res = op => res, - _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), + _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), _ = timeout => { let e = anyhow::Error::new(TimeoutOrCancel::Timeout); let e = e.context(format!("Timeout, last status: {copy_status:?}")); Err(e) }, - } + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } async fn time_travel_recover( diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index cb3df0985d61..8c984abed201 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -12,6 +12,7 @@ mod azure_blob; mod error; mod local_fs; +mod metrics; mod s3_bucket; mod simulate_failures; mod support; diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/metrics.rs similarity index 76% rename from libs/remote_storage/src/s3_bucket/metrics.rs rename to libs/remote_storage/src/metrics.rs index beca75592052..bbb51590f344 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -15,6 +15,7 @@ pub(crate) enum RequestKind { TimeTravel = 5, } +use scopeguard::ScopeGuard; use RequestKind::*; impl RequestKind { @@ -33,10 +34,10 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 6]); +pub(crate) struct RequestTyped([C; 6]); impl RequestTyped { - pub(super) fn get(&self, kind: RequestKind) -> &C { + pub(crate) fn get(&self, kind: RequestKind) -> &C { &self.0[kind.as_index()] } @@ -58,19 +59,19 @@ impl RequestTyped { } impl RequestTyped { - pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { + pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { self.get(kind).observe(started_at.elapsed().as_secs_f64()) } } -pub(super) struct PassFailCancelledRequestTyped { +pub(crate) struct PassFailCancelledRequestTyped { success: RequestTyped, fail: RequestTyped, cancelled: RequestTyped, } #[derive(Debug, Clone, Copy)] -pub(super) enum AttemptOutcome { +pub(crate) enum AttemptOutcome { Ok, Err, Cancelled, @@ -86,7 +87,7 @@ impl From<&Result> for AttemptOutcome { } impl AttemptOutcome { - pub(super) fn as_str(&self) -> &'static str { + pub(crate) fn as_str(&self) -> &'static str { match self { AttemptOutcome::Ok => "ok", AttemptOutcome::Err => "err", @@ -96,7 +97,7 @@ impl AttemptOutcome { } impl PassFailCancelledRequestTyped { - pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { + pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { let target = match outcome { AttemptOutcome::Ok => &self.success, AttemptOutcome::Err => &self.fail, @@ -119,7 +120,7 @@ impl PassFailCancelledRequestTyped { } impl PassFailCancelledRequestTyped { - pub(super) fn observe_elapsed( + pub(crate) fn observe_elapsed( &self, kind: RequestKind, outcome: impl Into, @@ -130,19 +131,44 @@ impl PassFailCancelledRequestTyped { } } -pub(super) struct BucketMetrics { +/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`]. +pub(crate) fn start_counting_cancelled_wait( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |_| { + crate::metrics::BUCKET_METRICS + .cancelled_waits + .get(kind) + .inc() + }) +} + +/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`]. 
+pub(crate) fn start_measuring_requests( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Cancelled, + started_at, + ) + }) +} + +pub(crate) struct BucketMetrics { /// Full request duration until successful completion, error or cancellation. - pub(super) req_seconds: PassFailCancelledRequestTyped, + pub(crate) req_seconds: PassFailCancelledRequestTyped, /// Total amount of seconds waited on queue. - pub(super) wait_seconds: RequestTyped, + pub(crate) wait_seconds: RequestTyped, /// Track how many semaphore awaits were cancelled per request type. /// /// This is in case cancellations are happening more than expected. - pub(super) cancelled_waits: RequestTyped, + pub(crate) cancelled_waits: RequestTyped, /// Total amount of deleted objects in batches or single requests. - pub(super) deleted_objects_total: IntCounter, + pub(crate) deleted_objects_total: IntCounter, } impl Default for BucketMetrics { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c3d6c75e20da..76cf3eac80ef 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -46,15 +46,16 @@ use utils::backoff; use super::StorageMetadata; use crate::{ - error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, - Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel, - MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + error::Cancelled, + metrics::{start_counting_cancelled_wait, start_measuring_requests}, + support::PermitCarrying, + ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, + S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -pub(super) mod metrics; - -use self::metrics::AttemptOutcome; -pub(super) use self::metrics::RequestKind; +use crate::metrics::AttemptOutcome; +pub(super) use crate::metrics::RequestKind; /// AWS S3 storage. pub struct S3Bucket { @@ -227,7 +228,7 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); @@ -248,7 +249,7 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); Ok(permit) @@ -287,7 +288,7 @@ impl S3Bucket { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, // e.g. when probing for timeline indices. 
- metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, @@ -295,7 +296,7 @@ impl S3Bucket { return Err(DownloadError::NotFound); } Err(e) => { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, @@ -371,12 +372,12 @@ impl S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &resp, started_at); let resp = resp.context("request deletion")?; - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .deleted_objects_total .inc_by(chunk.len() as u64); @@ -435,14 +436,14 @@ pin_project_lite::pin_project! { /// Times and tracks the outcome of the request. struct TimedDownload { started_at: std::time::Instant, - outcome: metrics::AttemptOutcome, + outcome: AttemptOutcome, #[pin] inner: S } impl PinnedDrop for TimedDownload { fn drop(mut this: Pin<&mut Self>) { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); } } } @@ -451,7 +452,7 @@ impl TimedDownload { fn new(started_at: std::time::Instant, inner: S) -> Self { TimedDownload { started_at, - outcome: metrics::AttemptOutcome::Cancelled, + outcome: AttemptOutcome::Cancelled, inner, } } @@ -468,8 +469,8 @@ impl>> Stream for TimedDownload { let res = ready!(this.inner.poll_next(cx)); match &res { Some(Ok(_)) => {} - Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err, - None => *this.outcome = metrics::AttemptOutcome::Ok, + Some(Err(_)) => *this.outcome = AttemptOutcome::Err, + None => *this.outcome = AttemptOutcome::Ok, } Poll::Ready(res) @@ -543,7 +544,7 @@ impl RemoteStorage for S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &response, started_at); @@ -625,7 +626,7 @@ impl RemoteStorage for S3Bucket { if let Ok(inner) = &res { // do not incl. timeouts as errors in metrics but cancellations let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, inner, started_at); } @@ -673,7 +674,7 @@ impl RemoteStorage for S3Bucket { }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); @@ -977,28 +978,6 @@ impl RemoteStorage for S3Bucket { } } -/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. -fn start_counting_cancelled_wait( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |_| { - metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc() - }) -} - -/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`]. -fn start_measuring_requests( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( - kind, - AttemptOutcome::Cancelled, - started_at, - ) - }) -} - // Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry struct VerOrDelete { kind: VerOrDeleteKind,