Add long running replication tests
These tests will help verify that replication, both physical and
logical, works as expected in Neon.

Co-authored-by: Sasha Krassovsky <sasha@neon.tech>
tristan957 and save-buffer committed Jul 5, 2024
1 parent cbe25b6 commit 121385c
Showing 3 changed files with 654 additions and 2 deletions.
65 changes: 64 additions & 1 deletion .github/workflows/benchmarking.yml
@@ -99,7 +99,7 @@ jobs:
# Set --sparse-ordering option of pytest-order plugin
# to ensure tests run in the order they appear in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
- extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+ extra_params: '-m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py -k \"not test_publisher_restart and not test_subscriber_lag\"'
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
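
The added `-k` filter above deselects the two new long-running tests from this general benchmarking job; they run in the dedicated replication-tests job added below. Wrapping extra_params in single quotes is what lets the escaped double quotes of the `-k` expression survive YAML parsing. As an illustrative sketch only (the real wiring goes through .github/actions/run-python-test-set), the same selection expressed through pytest's Python entry point would look like:

    import pytest

    # Run remote_cluster-marked tests in file order, with a per-test timeout,
    # while deselecting the two long-running replication tests by name.
    pytest.main([
        "-m", "remote_cluster",
        "--sparse-ordering",   # pytest-order: keep the order tests appear in the file
        "--timeout", "5400",   # pytest-timeout: per-test limit, in seconds
        "-k", "not test_publisher_restart and not test_subscriber_lag",
    ])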
@@ -125,6 +125,69 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

replication-tests:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"

runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init

steps:
- uses: actions/checkout@v4

- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest

- name: Run logical replication benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_logical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Run physical replication benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_physical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate

- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
295 changes: 294 additions & 1 deletion test_runner/performance/test_logical_replication.py
@@ -1,13 +1,24 @@
from __future__ import annotations

import time
import traceback
from typing import TYPE_CHECKING

import psycopg2
import psycopg2.extras
import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_api import connection_parameters_to_env
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
from fixtures.pg_version import PgVersion

if TYPE_CHECKING:
from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.neon_api import NeonAPI
from fixtures.neon_fixtures import NeonEnv, PgBin
from fixtures.pg_version import PgVersion


@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@@ -31,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
vanilla_pg.safe_psql("truncate table pgbench_history")

connstr = endpoint.connstr().replace("'", "''")
print(f"connstr='{connstr}'")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")

# Wait logical replication channel to be established
@@ -47,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
assert sum_master == sum_replica


def check_pgbench_still_running(pgbench, label=""):
    # poll() returns None while the subprocess is still alive; any return code
    # means pgbench exited before the test finished driving load.
    rc = pgbench.poll()
    if rc is not None:
        raise RuntimeError(f"{label} pgbench terminated early with return code {rc}")


def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600):
    """Return how many seconds the subscriber takes to catch up to the
    publisher's current flush LSN, polling pg_stat_subscription."""
    start = time.time()
    # Snapshot the publisher's LSN once: the subscriber only has to replay up
    # to this point, not chase WAL written while we poll.
    pub_cur.execute("SELECT pg_current_wal_flush_lsn()")
    pub_lsn = Lsn(pub_cur.fetchall()[0][0])
while (time.time() - start) < timeout_sec:
sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription")
res = sub_cur.fetchall()[0][0]
if res:
log.info(f"subscriber_lsn={res}")
sub_lsn = Lsn(res)
log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}")
if sub_lsn >= pub_lsn:
return time.time() - start
time.sleep(0.5)
raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec")
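
Both tests below drive this helper the same way. A minimal usage sketch (the connection strings here are placeholders; in the tests they come from the Neon API's connection_uri fields):

    import psycopg2

    # Hypothetical DSNs, for illustration only.
    pub_connstr = "postgres://user:secret@publisher.example.host/neondb"
    sub_connstr = "postgres://user:secret@subscriber.example.host/neondb"

    pub_conn = psycopg2.connect(pub_connstr)
    sub_conn = psycopg2.connect(sub_connstr)
    pub_conn.autocommit = True
    sub_conn.autocommit = True
    with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
        # Blocks (up to timeout_sec) until the subscriber replays past the
        # publisher's flush LSN; returns the elapsed seconds.
        lag = measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=60)
        log.info(f"subscriber caught up in {lag:.1f} seconds")
    pub_conn.close()
    sub_conn.close()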


@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_subscriber_lag(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
zenbenchmark: NeonBenchmarker,
):
"""
Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
on subscriber. Periodically restarts subscriber while still running the inserts, and
measures how long sync takes after restart.
"""
test_duration_min = 60
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"

pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
sub_endpoint_id = sub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
neon_api.restart_endpoint(
sub_project_id,
sub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)

# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)

finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
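
The nested try/except/finally blocks above encode the cleanup policy for the Neon projects the test creates: a project is deleted only when no error occurred, so a failed run leaves both projects behind for debugging (the final assert also fails the test before the publisher project would be deleted). Distilled as a sketch, where run_test_body and project_id are stand-ins rather than names from the test:

    error_occurred = False
    try:
        run_test_body()
    except Exception:
        error_occurred = True
        log.error(traceback.format_exc())
    finally:
        if not error_occurred:
            # Only clean up on success; keep the project around on failure.
            neon_api.delete_project(project_id)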


@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_publisher_restart(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
zenbenchmark: NeonBenchmarker,
):
"""
Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and
measures how long sync takes after restart.
"""
test_duration_min = 60
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"

pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
pub_endpoint_id = pub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

pub_workload.terminate()
neon_api.restart_endpoint(
pub_project_id,
pub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(pub_project_id)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)

# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)

finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
296 changes: 296 additions & 0 deletions test_runner/performance/test_physical_replication.py
(diff not rendered on this page)
