Skip to content

Commit

Permalink
SELECT 💣(); (#8270)
Browse files Browse the repository at this point in the history
## Problem
We want to be able to test how our infrastructure reacts to segfaults in
Postgres (for example, that we collect core dumps and get the required
logs/metrics, etc.)

## Summary of changes
- Add `trigger_segfault` function to `neon_test_utils` to trigger a
segfault in Postgres
- Add `trigger_panic` function to `neon_test_utils` to trigger SIGABRT
(by using `elog(PANIC, ...)`)
- Fix cleanup logic in regression tests for the case when an endpoint has crashed
  • Loading branch information
bayandin committed Jul 5, 2024
1 parent 7dd2e44 commit c9fd8d7
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pgxn/neon_test_utils/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ OBJS = \
neontest.o

EXTENSION = neon_test_utils
DATA = neon_test_utils--1.2.sql
DATA = neon_test_utils--1.3.sql
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"

PG_CONFIG = pg_config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,21 @@ CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL)
RETURNS VOID
AS 'MODULE_PATHNAME', 'neon_xlogflush'
LANGUAGE C PARALLEL UNSAFE;

-- Crash helper: binds to the C function `trigger_panic` in this module,
-- which reports a PANIC-level error through elog().
CREATE FUNCTION trigger_panic()
RETURNS VOID
AS 'MODULE_PATHNAME', 'trigger_panic'
LANGUAGE C PARALLEL UNSAFE;

-- Crash helper: binds to the C function `trigger_segfault` in this module,
-- which writes through a NULL pointer.
CREATE FUNCTION trigger_segfault()
RETURNS VOID
AS 'MODULE_PATHNAME', 'trigger_segfault'
LANGUAGE C PARALLEL UNSAFE;

-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun.
-- Implemented as a PL/pgSQL wrapper that calls trigger_segfault() and
-- discards its (void) result.
CREATE OR REPLACE FUNCTION 💣() RETURNS void
LANGUAGE plpgsql AS $$
BEGIN
PERFORM trigger_segfault();
END;
$$;
2 changes: 1 addition & 1 deletion pgxn/neon_test_utils/neon_test_utils.control
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# neon_test_utils extension
comment = 'helpers for neon testing and debugging'
default_version = '1.2'
default_version = '1.3'
module_pathname = '$libdir/neon_test_utils'
relocatable = true
trusted = true
23 changes: 23 additions & 0 deletions pgxn/neon_test_utils/neontest.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
PG_FUNCTION_INFO_V1(neon_xlogflush);
PG_FUNCTION_INFO_V1(trigger_panic);
PG_FUNCTION_INFO_V1(trigger_segfault);

/*
* Linkage to functions in neon module.
Expand Down Expand Up @@ -489,3 +491,24 @@ neon_xlogflush(PG_FUNCTION_ARGS)
XLogFlush(lsn);
PG_RETURN_VOID();
}

/*
 * Function to trigger panic.
 *
 * SQL-callable test helper (no arguments, returns void) that reports a
 * PANIC-level message through elog(). Intended for tests that check how
 * the surrounding infrastructure handles a crashing endpoint.
 */
Datum
trigger_panic(PG_FUNCTION_ARGS)
{
/* PANIC-level report: this is the whole point of the function. */
elog(PANIC, "neon_test_utils: panic");
/* Presumably never reached after a PANIC; kept so the function still ends with a return -- TODO confirm. */
PG_RETURN_VOID();
}

/*
 * Function to trigger a segfault.
 *
 * SQL-callable test helper (no arguments, returns void) that deliberately
 * writes through a NULL pointer so the calling backend crashes.
 *
 * The pointer is declared volatile on purpose: a store through a plain
 * NULL pointer is undefined behaviour, and an optimizing compiler is
 * allowed to delete it (clang warns about exactly this), in which case
 * the function would return normally instead of crashing. A volatile
 * access must actually be performed.
 */
Datum
trigger_segfault(PG_FUNCTION_ARGS)
{
	volatile int *ptr = NULL;

	/* Intentional invalid write: this is what raises the segfault. */
	*ptr = 42;

	/* Not expected to be reached; keeps the function well-formed. */
	PG_RETURN_VOID();
}
18 changes: 14 additions & 4 deletions test_runner/fixtures/neon_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,8 @@ def __exit__(
# if the test threw an exception, don't check for errors
# as a failing assertion would cause the cleanup below to fail
ps_assert_metric_no_errors=(exc_type is None),
# do not fail on endpoint errors to allow the rest of cleanup to proceed
fail_on_endpoint_errors=False,
)
cleanup_error = None

Expand Down Expand Up @@ -1214,11 +1216,11 @@ def start(self, timeout_in_seconds: Optional[int] = None):
for f in futs:
f.result()

def stop(self, immediate=False, ps_assert_metric_no_errors=False):
def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
"""
After this method returns, there should be no child processes running.
"""
self.endpoints.stop_all()
self.endpoints.stop_all(fail_on_endpoint_errors)

# Stop storage controller before pageservers: we don't want it to spuriously
# detect a pageserver "failure" during test teardown
Expand Down Expand Up @@ -3899,9 +3901,17 @@ def create(
pageserver_id=pageserver_id,
)

def stop_all(self) -> "EndpointFactory":
def stop_all(self, fail_on_error=True) -> "EndpointFactory":
exception = None
for ep in self.endpoints:
ep.stop()
try:
ep.stop()
except Exception as e:
log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}")
exception = e

if fail_on_error and exception is not None:
raise exception

return self

Expand Down
23 changes: 23 additions & 0 deletions test_runner/regress/test_endpoint_crash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder


@pytest.mark.parametrize(
    "sql_func",
    [
        "trigger_panic",
        "trigger_segfault",
        "💣",  # calls `trigger_segfault` internally
    ],
)
def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str):
    """
    Check that each crash helper from neon_test_utils takes the endpoint down.

    The helper named by `sql_func` is invoked through SQL, and the client is
    expected to fail with an error reporting that the server terminated
    abnormally.
    """
    branch_name = "test_endpoint_crash"

    env = neon_env_builder.init_start()
    env.neon_cli.create_branch(branch_name)
    endpoint = env.endpoints.create_start(branch_name)

    # The crash helpers live in the extension, so it has to be installed first.
    endpoint.safe_psql("CREATE EXTENSION neon_test_utils;")

    crash_query = f"SELECT {sql_func}();"
    with pytest.raises(Exception, match="This probably means the server terminated abnormally"):
        endpoint.safe_psql(crash_query)

1 comment on commit c9fd8d7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3129 tests run: 3001 passed, 1 failed, 127 skipped (full report)


Failures on Postgres 14

  • test_basebackup_with_high_slru_count[github-actions-selfhosted-vectored-10-13-30]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-vectored-10-13-30]"
Flaky tests (2)

Postgres 16

  • test_secondary_background_downloads: release
  • test_tenant_creation_fails: debug

Code coverage* (full report)

  • functions: 32.6% (6932 of 21275 functions)
  • lines: 50.0% (54485 of 108968 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
c9fd8d7 at 2024-07-05T15:39:15.202Z :recycle:

Please sign in to comment.