From 00d1dbdd23b949f7c5565869ad35860c9f64537f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 4 Jul 2024 19:16:46 +0100 Subject: [PATCH] test_runner: fix endpoints cleanup --- test_runner/fixtures/neon_fixtures.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c002e11c1c08..5fb4d948175f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -943,6 +943,8 @@ def __exit__( # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), + # do not fail on endpoint errors to allow the rest of cleanup to proceed + fail_on_endpoint_errors=False, ) cleanup_error = None @@ -1214,11 +1216,11 @@ def start(self, timeout_in_seconds: Optional[int] = None): for f in futs: f.result() - def stop(self, immediate=False, ps_assert_metric_no_errors=False): + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. """ - self.endpoints.stop_all() + self.endpoints.stop_all(fail_on_endpoint_errors) # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown @@ -3899,9 +3901,17 @@ def create( pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self