Add long running replication tests
These tests will help verify that replication, both physical and
logical, works as expected in Neon.

Co-authored-by: Sasha Krassovsky <sasha@neon.tech>
tristan957 and save-buffer committed Jul 5, 2024
1 parent cbe25b6 commit 121385c
Showing 3 changed files with 654 additions and 2 deletions.
65 changes: 64 additions & 1 deletion .github/workflows/benchmarking.yml
@@ -99,7 +99,7 @@ jobs:
# Set --sparse-ordering option of pytest-order plugin
# to ensure tests run in the order they appear in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
- extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+ extra_params: '-m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py -k \"not test_publisher_restart and not test_subscriber_lag\"'
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
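
The added `-k` filter above deselects the two new long-running tests from this general benchmarking job; they run in the dedicated replication-tests job added below. Wrapping extra_params in single quotes is what lets the escaped double quotes of the `-k` expression survive YAML parsing. As an illustrative sketch only (the real wiring goes through .github/actions/run-python-test-set), the same selection expressed through pytest's Python entry point would look like:

    import pytest

    # Run remote_cluster-marked tests in file order, with a per-test timeout,
    # while deselecting the two long-running replication tests by name.
    pytest.main([
        "-m", "remote_cluster",
        "--sparse-ordering",   # pytest-order: keep the order tests appear in the file
        "--timeout", "5400",   # pytest-timeout: per-test limit, in seconds
        "-k", "not test_publisher_restart and not test_subscriber_lag",
    ])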
@@ -125,6 +125,69 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

replication-tests:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"

runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
options: --init

steps:
- uses: actions/checkout@v4

- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest

- name: Run logical replication benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_logical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Run physical replication benchmark
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_physical_replication.py
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}

- name: Create Allure report
if: ${{ !cancelled() }}
uses: ./.github/actions/allure-report-generate

- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
295 changes: 294 additions & 1 deletion test_runner/performance/test_logical_replication.py
@@ -1,13 +1,24 @@
from __future__ import annotations

import time
import traceback
from typing import TYPE_CHECKING

import psycopg2
import psycopg2.extras
import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_api import connection_parameters_to_env
from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
from fixtures.pg_version import PgVersion

if TYPE_CHECKING:
from fixtures.benchmark_fixture import NeonBenchmarker
from fixtures.neon_api import NeonAPI
from fixtures.neon_fixtures import NeonEnv, PgBin
from fixtures.pg_version import PgVersion


@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@@ -31,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
vanilla_pg.safe_psql("truncate table pgbench_history")

connstr = endpoint.connstr().replace("'", "''")
print(f"connstr='{connstr}'")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")

# Wait logical replication channel to be established
@@ -47,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
assert sum_master == sum_replica


def check_pgbench_still_running(pgbench, label=""):
    # poll() returns None while the subprocess is still alive; any return code
    # means pgbench exited before the test finished driving load.
    rc = pgbench.poll()
    if rc is not None:
        raise RuntimeError(f"{label} pgbench terminated early with return code {rc}")


def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600):
    """Return how many seconds the subscriber takes to catch up to the
    publisher's current flush LSN, polling pg_stat_subscription."""
    start = time.time()
    # Snapshot the publisher's LSN once: the subscriber only has to replay up
    # to this point, not chase WAL written while we poll.
    pub_cur.execute("SELECT pg_current_wal_flush_lsn()")
    pub_lsn = Lsn(pub_cur.fetchall()[0][0])
while (time.time() - start) < timeout_sec:
sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription")
res = sub_cur.fetchall()[0][0]
if res:
log.info(f"subscriber_lsn={res}")
sub_lsn = Lsn(res)
log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}")
if sub_lsn >= pub_lsn:
return time.time() - start
time.sleep(0.5)
raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec")
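
Both tests below drive this helper the same way. A minimal usage sketch (the connection strings here are placeholders; in the tests they come from the Neon API's connection_uri fields):

    import psycopg2

    # Hypothetical DSNs, for illustration only.
    pub_connstr = "postgres://user:secret@publisher.example.host/neondb"
    sub_connstr = "postgres://user:secret@subscriber.example.host/neondb"

    pub_conn = psycopg2.connect(pub_connstr)
    sub_conn = psycopg2.connect(sub_connstr)
    pub_conn.autocommit = True
    sub_conn.autocommit = True
    with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
        # Blocks (up to timeout_sec) until the subscriber replays past the
        # publisher's flush LSN; returns the elapsed seconds.
        lag = measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=60)
        log.info(f"subscriber caught up in {lag:.1f} seconds")
    pub_conn.close()
    sub_conn.close()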


@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_subscriber_lag(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
zenbenchmark: NeonBenchmarker,
):
"""
Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
on subscriber. Periodically restarts subscriber while still running the inserts, and
measures how long sync takes after restart.
"""
test_duration_min = 60
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"

pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
sub_endpoint_id = sub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
sub_workload.terminate()
neon_api.restart_endpoint(
sub_project_id,
sub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(sub_project_id)
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)

# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)

finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
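
The nested try/except/finally blocks above encode the cleanup policy for the Neon projects the test creates: a project is deleted only when no error occurred, so a failed run leaves both projects behind for debugging (the final assert also fails the test before the publisher project would be deleted). Distilled as a sketch, where run_test_body and project_id are stand-ins rather than names from the test:

    error_occurred = False
    try:
        run_test_body()
    except Exception:
        error_occurred = True
        log.error(traceback.format_exc())
    finally:
        if not error_occurred:
            # Only clean up on success; keep the project around on failure.
            neon_api.delete_project(project_id)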


@pytest.mark.remote_cluster
@pytest.mark.timeout(2 * 60 * 60)
def test_publisher_restart(
pg_bin: PgBin,
neon_api: NeonAPI,
pg_version: PgVersion,
zenbenchmark: NeonBenchmarker,
):
"""
Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and
measures how long sync takes after restart.
"""
test_duration_min = 60
sync_interval_min = 5
pgbench_duration = f"-T{test_duration_min * 60 * 2}"

pub_project = neon_api.create_project(pg_version)
pub_project_id = pub_project["project"]["id"]
pub_endpoint_id = pub_project["endpoints"][0]["id"]
neon_api.wait_for_operation_to_finish(pub_project_id)
error_occurred = False
try:
sub_project = neon_api.create_project(pg_version)
sub_project_id = sub_project["project"]["id"]
neon_api.wait_for_operation_to_finish(sub_project_id)
try:
pub_env = connection_parameters_to_env(
pub_project["connection_uris"][0]["connection_parameters"]
)
sub_env = connection_parameters_to_env(
sub_project["connection_uris"][0]["connection_parameters"]
)
pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
sub_connstr = sub_project["connection_uris"][0]["connection_uri"]

pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)

pub_conn = psycopg2.connect(pub_connstr)
sub_conn = psycopg2.connect(sub_connstr)
pub_conn.autocommit = True
sub_conn.autocommit = True
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
sub_cur.execute("truncate table pgbench_accounts")
sub_cur.execute("truncate table pgbench_history")

pub_cur.execute(
"create publication pub1 for table pgbench_accounts, pgbench_history"
)
sub_cur.execute(
f"create subscription sub1 connection '{pub_connstr}' publication pub1"
)

initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
pub_conn.close()
sub_conn.close()

zenbenchmark.record(
"initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
)

pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
)
try:
sub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-S"],
env=sub_env,
)
try:
start = time.time()
while time.time() - start < test_duration_min * 60:
time.sleep(sync_interval_min * 60)
check_pgbench_still_running(pub_workload, "pub")
check_pgbench_still_running(sub_workload, "sub")

pub_workload.terminate()
neon_api.restart_endpoint(
pub_project_id,
pub_endpoint_id,
)
neon_api.wait_for_operation_to_finish(pub_project_id)
pub_workload = pg_bin.run_nonblocking(
["pgbench", "-c10", pgbench_duration, "-Mprepared"],
env=pub_env,
)
with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
sub_connstr
) as sub_conn:
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
lag = measure_logical_replication_lag(sub_cur, pub_cur)

log.info(f"Replica lagged behind master by {lag} seconds")
zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)

# Measure storage to make sure replication information isn't bloating storage
sub_storage = neon_api.get_project_details(sub_project_id)["project"][
"synthetic_storage_size"
]
pub_storage = neon_api.get_project_details(pub_project_id)["project"][
"synthetic_storage_size"
]
zenbenchmark.record(
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
)

finally:
sub_workload.terminate()
finally:
pub_workload.terminate()
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
if not error_occurred:
neon_api.delete_project(sub_project_id)
except Exception as e:
error_occurred = True
log.error(f"Caught exception {e}")
log.error(traceback.format_exc())
finally:
assert not error_occurred
neon_api.delete_project(pub_project_id)
296 changes: 296 additions & 0 deletions test_runner/performance/test_physical_replication.py
(diff not rendered on this page)
