From 1ea5d8b1327d2e93cbe11682f60a90e35d42d1ee Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 24 Jun 2024 18:03:53 +0100 Subject: [PATCH] tests: accommodate some messages that can fail tests (#8144) ## Problem - `test_storage_controller_many_tenants` can fail with warnings in the storage controller about timeline creation holding a lock for too long, because this test stresses the machine running the test with many concurrent timeline creations - `test_tenant_delete_smoke` can fail when synthetic remote storage errors show up ## Summary of changes - tolerate warnings about slow timeline creation in test_storage_controller_many_tenants - tolerate both possible errors during error_tolerant_delete --- .../performance/test_storage_controller_scale.py | 11 ++++++++++- test_runner/regress/test_tenant_delete.py | 8 ++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index cb013ae8c3e3..a4c8c8ac421a 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -48,7 +48,16 @@ def test_storage_controller_many_tenants( # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. - env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + ".*Many shards are waiting to reconcile", + # We will create many timelines concurrently, so they might get slow enough to trip the warning + # that timeline creation is holding a lock too long. 
+ ".*Shared lock by TimelineCreate.*was held.*", + ] + ) for ps in env.pageservers: # This can happen because when we do a loop over all pageservers and mark them offline/active, diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index a3316f2f4592..d3fba32a19e0 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -31,8 +31,12 @@ def error_tolerant_delete(ps_http, tenant_id): if e.status_code == 500: # This test uses failure injection, which can produce 500s as the pageserver expects # the object store to always be available, and the ListObjects during deletion is generally - # an infallible operation - assert "simulated failure of remote operation" in e.message + # an infallible operation. This can show up as a clear simulated error, or as a general + # error during delete_objects() + assert ( + "simulated failure of remote operation" in e.message + or "failed to delete" in e.message + ) else: raise else: