From 46b5b4c1ad5713bf6107790f672c316ce1d03395 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 29 Feb 2024 09:23:29 +0000 Subject: [PATCH 01/13] bench: run branch_creation_many at 500, seeded --- test_runner/performance/test_branch_creation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 6edcb8f1f2fa..a2df3801a383 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -89,11 +89,14 @@ def run_pgbench(branch: str): _record_branch_creation_durations(neon_compare, branch_creation_durations) -@pytest.mark.parametrize("n_branches", [1024]) +@pytest.mark.parametrize("n_branches", [500, 1024]) # Test measures the latency of branch creation when creating a lot of branches. def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): env = neon_compare.env + # seed the prng so we will measure the same structure every time + random.seed("2024-02-29") + env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") From 9f6615168a86402e7f4a60fd229ec00b2b91ba79 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 29 Feb 2024 11:22:38 +0000 Subject: [PATCH 02/13] chore: move the comment --- test_runner/performance/test_branch_creation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index a2df3801a383..333090db84da 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -90,8 +90,10 @@ def run_pgbench(branch: str): @pytest.mark.parametrize("n_branches", [500, 1024]) -# Test measures the latency of branch creation when creating a lot of branches. def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): + """ + Test measures the latency of branch creation when creating a lot of branches. + """ env = neon_compare.env # seed the prng so we will measure the same structure every time From aecd3528d781b44178a92339c9bd924d33be8f19 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 29 Feb 2024 11:22:44 +0000 Subject: [PATCH 03/13] repeated runs --- test_runner/performance/test_branch_creation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 333090db84da..3ad37ba916f8 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -89,6 +89,7 @@ def run_pgbench(branch: str): _record_branch_creation_durations(neon_compare, branch_creation_durations) +@pytest.mark.repeat(5) @pytest.mark.parametrize("n_branches", [500, 1024]) def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): """ From 3b0fa87555f17be26a439f61d4e4962053015c58 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 1 Mar 2024 16:56:45 +0000 Subject: [PATCH 04/13] use benchmark local rng --- test_runner/performance/test_branch_creation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 3ad37ba916f8..32a740356745 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -98,7 +98,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): env = neon_compare.env # seed the prng so we will measure the same structure every time - random.seed("2024-02-29") + rng = random.Random("2024-02-29") env.neon_cli.create_branch("b0") @@ -109,7 +109,8 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): for i in range(n_branches): # random a source branch - p = random.randint(0, i) + p = rng.randint(0, i) + timer = timeit.default_timer() env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) dur = timeit.default_timer() - timer From 2743422fdbbfed573ff3520c8b2e8b76e24a5407 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 1 Mar 2024 16:57:21 +0000 Subject: [PATCH 05/13] complicate the shapes --- test_runner/performance/test_branch_creation.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 32a740356745..3ace364a85bc 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -91,7 +91,8 @@ def run_pgbench(branch: str): @pytest.mark.repeat(5) @pytest.mark.parametrize("n_branches", [500, 1024]) -def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): +@pytest.mark.parametrize("shape", ["one_ancestor", "random"]) +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): """ Test measures the latency of branch creation when creating a lot of branches. """ @@ -108,11 +109,16 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations = [] for i in range(n_branches): - # random a source branch - p = rng.randint(0, i) + if shape == "random": + parent = f"b{rng.randint(0, i)}" + elif shape == "one_ancestor": + parent = "b0" + else: + raise RuntimeError(f"unimplemented shape: {shape}") timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + # each of these uploads to remote storage before completion + env.neon_cli.create_branch(f"b{i + 1}", parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) From 87e9b7c790c262b8b0614f4333dfe1a744f08f5d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 1 Mar 2024 16:57:32 +0000 Subject: [PATCH 06/13] record startup metrics --- .../performance/test_branch_creation.py | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 3ace364a85bc..0057b129848a 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -7,11 +7,14 @@ from typing import List import pytest -from fixtures.benchmark_fixture import MetricReport +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn +from fixtures.utils import wait_until +from prometheus_client.samples import Sample def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): @@ -124,6 +127,50 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: _record_branch_creation_durations(neon_compare, branch_creation_durations) + endpoint.stop_and_destroy() + + env.pageserver.restart() + env.pageserver.quiesce_tenants() + + wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + + +def wait_and_record_startup_metrics( + pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str +): + """ + Waits until all startup metrics have non-zero values on the pageserver, then records them on the target + """ + + client = pageserver.http_client() + + expected_labels = set( + [ + "background_jobs_can_start", + "complete", + "initial", + "initial_tenant_load", + "initial_tenant_load_remote", + ] + ) + + def metrics_are_filled() -> List[Sample]: + m = client.get_metrics() + samples = m.query_all("pageserver_startup_duration_seconds") + # we should not have duplicate labels + matching = [ + x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0 + ] + assert len(matching) == len(expected_labels) + return matching + + samples = wait_until(10, 1, metrics_are_filled) + + for sample in samples: + phase = sample.labels["phase"] + name = f"{prefix}.{phase}" + target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER) + # Test measures the branch creation time when branching from a timeline with a lot of relations. # From 45e0ecfedb2580bb184317fcfa857d250508d59c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 1 Mar 2024 17:25:09 +0000 Subject: [PATCH 07/13] remove repeat --- test_runner/performance/test_branch_creation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 0057b129848a..38f17258d550 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -92,7 +92,6 @@ def run_pgbench(branch: str): _record_branch_creation_durations(neon_compare, branch_creation_durations) -@pytest.mark.repeat(5) @pytest.mark.parametrize("n_branches", [500, 1024]) @pytest.mark.parametrize("shape", ["one_ancestor", "random"]) def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): From e727ce0f8b5c4e971d6c3fd3359fec9b1d5dc126 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 6 Mar 2024 12:28:36 +0000 Subject: [PATCH 08/13] feat: measure shutdown time --- test_runner/performance/test_branch_creation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 38f17258d550..6710ec4e35b3 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -128,7 +128,11 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: endpoint.stop_and_destroy() - env.pageserver.restart() + with neon_compare.record_duration("pageserver_shutdown"): + # this sleeps 100ms between polls + env.pageserver.stop() + + env.pageserver.start() env.pageserver.quiesce_tenants() wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") From b6276e90ce390918ad7dd12c64d6c4b8b5d953db Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 6 Mar 2024 14:17:28 +0000 Subject: [PATCH 09/13] record first compaction duration (with debug logging) --- pageserver/src/tenant/tasks.rs | 7 +++- .../performance/test_branch_creation.py | 42 ++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 57c3edcddd29..e4f5f7513288 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -101,6 +101,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) + // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) @@ -198,7 +199,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 6710ec4e35b3..8fa1734656e6 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,8 +1,10 @@ import random +import re import statistics import threading import time import timeit +import uuid from contextlib import closing from typing import List @@ -132,11 +134,49 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # this sleeps 100ms between polls env.pageserver.stop() - env.pageserver.start() + _, first_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains("INFO version: git:") + ) + + # start without gc so we can time compaction with less noise; use shorter + # period for compaction so it starts earlier + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ compaction_period = '1s', gc_period = '0s' }", + ), + # this does print more than we want, but the number should be comparable between runs + extra_env_vars={ + "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" + }, + ) + + _, second_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains("INFO version: git:", first_start) + ) env.pageserver.quiesce_tenants() + http_client = env.pageserver.http_client() + marker = uuid.uuid4().hex + http_client.post_tracing_event("info", marker) + + _, position = wait_until( + 10, 1, lambda: env.pageserver.assert_log_contains(marker, second_start) + ) + wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + msg, _ = wait_until( + 30, + 1, + lambda: env.pageserver.assert_log_contains( + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", position + ), + ) + needle = re.search(" elapsed_ms=([0-9]+)", msg) + assert needle is not None, "failed to find the elapsed time" + duration = int(needle.group(1)) / 1000.0 + neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER) + def wait_and_record_startup_metrics( pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str From bd7654ce3281d53ff11e6a02408ac3d55878d698 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 6 Mar 2024 14:17:44 +0000 Subject: [PATCH 10/13] rename shutdown recording name --- test_runner/performance/test_branch_creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 8fa1734656e6..720e70c95ec3 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -130,7 +130,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: endpoint.stop_and_destroy() - with neon_compare.record_duration("pageserver_shutdown"): + with neon_compare.record_duration("shutdown"): # this sleeps 100ms between polls env.pageserver.stop() From 07916057dbc15bca498e86d50e3d8f4e9623ad71 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 7 Mar 2024 12:22:09 +0000 Subject: [PATCH 11/13] fix: start line is different on ci and locally --- test_runner/performance/test_branch_creation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 720e70c95ec3..4f4024af0b45 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -134,8 +134,11 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # this sleeps 100ms between polls env.pageserver.stop() + startup_line = "INFO version: git(-env)?:" + + # find the first line of the log file so we can find the next start later _, first_start = wait_until( - 5, 1, lambda: env.pageserver.assert_log_contains("INFO version: git:") + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line) ) # start without gc so we can time compaction with less noise; use shorter @@ -151,7 +154,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: ) _, second_start = wait_until( - 5, 1, lambda: env.pageserver.assert_log_contains("INFO version: git:", first_start) + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) ) env.pageserver.quiesce_tenants() From f2b4332f64d0963b1f826a322d5c7aa99a30ac26 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 7 Mar 2024 12:22:36 +0000 Subject: [PATCH 12/13] set higher compaction period --- test_runner/performance/test_branch_creation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 4f4024af0b45..92c4abd31043 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -145,7 +145,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # period for compaction so it starts earlier env.pageserver.start( overrides=( - "--pageserver-config-override=tenant_config={ compaction_period = '1s', gc_period = '0s' }", + "--pageserver-config-override=tenant_config={ compaction_period = '3s', gc_period = '0s' }", ), # this does print more than we want, but the number should be comparable between runs extra_env_vars={ @@ -168,6 +168,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + # wait for compaction to complete, which most likely has already done so multiple times msg, _ = wait_until( 30, 1, From 8e2913dcc784a6023b99c329ca4658c725430993 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 7 Mar 2024 12:31:13 +0000 Subject: [PATCH 13/13] remove unnecessary marker posting and finding it is unnecessary because we know we are on the second run already. --- test_runner/performance/test_branch_creation.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 92c4abd31043..9777bf67489c 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -4,7 +4,6 @@ import threading import time import timeit -import uuid from contextlib import closing from typing import List @@ -137,9 +136,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: startup_line = "INFO version: git(-env)?:" # find the first line of the log file so we can find the next start later - _, first_start = wait_until( - 5, 1, lambda: env.pageserver.assert_log_contains(startup_line) - ) + _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier @@ -158,14 +155,6 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: ) env.pageserver.quiesce_tenants() - http_client = env.pageserver.http_client() - marker = uuid.uuid4().hex - http_client.post_tracing_event("info", marker) - - _, position = wait_until( - 10, 1, lambda: env.pageserver.assert_log_contains(marker, second_start) - ) - wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") # wait for compaction to complete, which most likely has already done so multiple times @@ -173,7 +162,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: 30, 1, lambda: env.pageserver.assert_log_contains( - f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", position + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start ), ) needle = re.search(" elapsed_ms=([0-9]+)", msg)