Skip to content

Commit

Permalink
tests: remove allow-list entries for stale generation messages
Browse files Browse the repository at this point in the history
  • Loading branch information
jcsp committed Jul 3, 2024
1 parent 545a3fa commit 4bb2e5f
Show file tree
Hide file tree
Showing 17 changed files with 0 additions and 102 deletions.
4 changes: 0 additions & 4 deletions test_runner/fixtures/pageserver/many_tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ def single_timeline(

log.info("detach template tenant form pageserver")
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)

log.info(f"duplicating template tenant {ncopies} times in S3")
tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,6 @@ def setup_template(env: NeonEnv):
}
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)
ep = env.endpoints.create_start("main", tenant_id=template_tenant)
ep.safe_psql("create table foo(b text)")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):

template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)

ps_http = env.pageserver.http_client()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
}
template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
env.pageserver.tenant_detach(template_tenant)
env.pageserver.allowed_errors.append(
# tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely
".*Dropped remote consistent LSN updates.*",
)
env.pageserver.tenant_attach(template_tenant, config)
ps_http = env.pageserver.http_client()
with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
Expand Down
8 changes: 0 additions & 8 deletions test_runner/performance/test_storage_controller_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,6 @@ def test_storage_controller_many_tenants(
)

for ps in env.pageservers:
# This can happen because when we do a loop over all pageservers and mark them offline/active,
# reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of
# bumping generation before other attachments are detached.
#
# We could clean this up by making reconcilers respect the .observed of their predecessor, if
# we spawn with a wait for the predecessor.
ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

# Storage controller is allowed to drop pageserver requests when the cancellation token
# for a Reconciler fires.
ps.allowed_errors.append(".*request was dropped before completing.*")
Expand Down
6 changes: 0 additions & 6 deletions test_runner/regress/test_attach_tenant_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
[
# eviction might be the first one after an attach to access the layers
".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
# detach can happen before we get to validate the generation number
".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
]
)
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
Expand Down Expand Up @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N

env.pageserver.allowed_errors.extend(
[
# This fixture detaches the tenant, and tests using it will tend to re-attach it
# shortly after. There may be un-processed deletion_queue validations from the
# initial attachment
".*Dropped remote consistent LSN updates.*",
# This fixture is for tests that will intentionally generate 400 responses
".*Error processing HTTP request: Bad request",
]
Expand Down
5 changes: 0 additions & 5 deletions test_runner/regress/test_change_pageserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
)
env = neon_env_builder.init_start()

for pageserver in env.pageservers:
# This test dual-attaches a tenant, one of the pageservers will therefore
# be running with a stale generation.
pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

env.neon_cli.create_branch("test_change_pageserver")
endpoint = env.endpoints.create_start("test_change_pageserver")

Expand Down
3 changes: 0 additions & 3 deletions test_runner/regress/test_layers_from_future.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):

env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

ps_http = env.pageserver.http_client()

Expand Down
13 changes: 0 additions & 13 deletions test_runner/regress/test_pageserver_generations.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

main_pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

# Now advance the generation in the control plane: subsequent validations
# from the running pageserver will fail. No more deletions should happen.
env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id)
Expand Down Expand Up @@ -397,8 +393,6 @@ def assert_deletions_submitted(n: int) -> None:
# validated before restart.
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])

# If we lost the attachment, we should have dropped our pre-restart deletions.
assert get_deletion_queue_dropped(ps_http) == before_restart_depth

Expand Down Expand Up @@ -553,13 +547,6 @@ def test_multi_attach(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline

# We will intentionally create situations where stale deletions happen from non-latest-generation
# nodes when the tenant is multiply-attached
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

# Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
Expand Down
3 changes: 0 additions & 3 deletions test_runner/regress/test_pageserver_secondary.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
for ps in env.pageservers:
ps.allowed_errors.extend(
[
# We will make no effort to avoid stale attachments
".*Dropped remote consistent LSN updates.*",
".*Dropping stale deletions.*",
# page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found
".*query handler.*Tenant.*not found.*",
# page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active
Expand Down
7 changes: 0 additions & 7 deletions test_runner/regress/test_remote_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,13 +355,6 @@ def churn_while_failpoints_active(result):
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()

# We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before
# we later increment when actually attaching it again, leading to skipping a generation and potentially getting
# these warnings if there was a durable but un-executed deletion list at time of restart.
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)

dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
Expand Down
4 changes: 0 additions & 4 deletions test_runner/regress/test_sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,10 +1144,6 @@ def test_sharding_split_failures(
)

for ps in env.pageservers:
# When we do node failures and abandon a shard, it will de-facto have old generation and
# thereby be unable to publish remote consistent LSN updates
ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

# If we're using a failure that will panic the storage controller, all background
# upcalls from the pageserver can fail
ps.allowed_errors.append(".*calling control plane generation validation API failed.*")
Expand Down
15 changes: 0 additions & 15 deletions test_runner/regress/test_storage_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,6 @@ def test_storage_controller_smoke(
neon_env_builder.num_pageservers = 3
env = neon_env_builder.init_configs()

for pageserver in env.pageservers:
# This test detaches tenants during migration, which can race with deletion queue operations,
# during detach we only do an advisory flush, we don't wait for it.
pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])

# Start services by hand so that we can skip a pageserver (this will start + register later)
env.broker.try_start()
env.storage_controller.start()
Expand Down Expand Up @@ -484,9 +479,6 @@ def handler(request: Request):
# Start running
env = neon_env_builder.init_start()

# We will to an unclean migration, which will result in deletion queue warnings
env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*")

# Initial notification from tenant creation
assert len(notifications) == 1
expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
Expand Down Expand Up @@ -1054,13 +1046,6 @@ def tenants_placed():
online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids

for node_id in offline_node_ids:
env.get_pageserver(node_id).allowed_errors.append(
# In the case of the failpoint failure, the impacted pageserver
# still believes it has the tenant attached since location
# config calls into it will fail due to being marked offline.
".*Dropped remote consistent LSN updates.*",
)

if len(offline_node_ids) > 1:
env.get_pageserver(node_id).allowed_errors.append(
".*Scheduling error when marking pageserver.*offline.*",
Expand Down
4 changes: 0 additions & 4 deletions test_runner/regress/test_tenant_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):

assert not config_path.exists(), "detach did not remove config file"

# The re-attach's increment of the generation number may invalidate deletion queue
# updates in flight from the previous attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

env.pageserver.tenant_attach(tenant_id)
wait_until(
number_of_iterations=5,
Expand Down
12 changes: 0 additions & 12 deletions test_runner/regress/test_tenant_detach.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):

env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE t(key int primary key, value text)")
Expand Down Expand Up @@ -349,10 +345,6 @@ def test_detach_while_attaching(

env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
Expand Down Expand Up @@ -422,10 +414,6 @@ def test_detach_while_activating(

env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

# Our re-attach may race with the deletion queue processing LSN updates
# from the original attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

data_id = 1
data_secret = "very secret secret"
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
Expand Down
2 changes: 0 additions & 2 deletions test_runner/regress/test_tenant_relocation.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,6 @@ def test_tenant_relocation(
[
# Needed for detach polling on the original pageserver
f".*NotFound: tenant {tenant_id}.*",
# We will dual-attach in this test, so stale generations are expected
".*Dropped remote consistent LSN updates.*",
]
)

Expand Down
4 changes: 0 additions & 4 deletions test_runner/regress/test_tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
# generation nubmers out of order.
env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")

# Our multiple creation requests will advance generation quickly, and when we skip
# a generation number we can generate these warnings
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+")

# Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
# an incomplete attach, or some other problem. In the field this should be rare,
# so we allow it to log at WARN, even if it is occasionally a false positive.
Expand Down

0 comments on commit 4bb2e5f

Please sign in to comment.