Skip to content

Commit

Permalink
test_local_only_layers_after_crash: various fixes (#7986)
Browse files (browse the repository at this point in the history)
In #7927 I needed to fix this test case, but it should be possible to
land these fixes independently of the layer ingestion code change.

The most important fix concerns the behavior when an image layer is found:
formatting the error message itself raises an exception, which obscures the
fact that we found an image layer.
  • Loading branch information
koivunej committed Jun 7, 2024
1 parent 66c6b27 commit 8ee191c
Showing 1 changed file with 13 additions and 16 deletions.
29 changes: 13 additions & 16 deletions test_runner/regress/test_pageserver_crash_consistency.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import time

import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.common_types import ImageLayerName, parse_layer_file_name
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload_queue_empty,
wait_until_tenant_active,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
Expand All @@ -25,10 +22,9 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:

env = neon_env_builder.init_start(
initial_tenant_conf={
"checkpoint_distance": f"{1024 ** 2}",
"compaction_target_size": f"{1024 ** 2}",
"checkpoint_distance": f"{10 * 1024**2}",
"compaction_period": "0 s",
"compaction_threshold": "3",
"compaction_threshold": "999999",
}
)
pageserver_http = env.pageserver.http_client()
Expand All @@ -42,13 +38,13 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])

lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop()

# make sure we receive no new wal after this, so that we'll write over the same L1 file.
endpoint.stop()
for sk in env.safekeepers:
sk.stop()

pageserver_http.patch_tenant_config_client_side(tenant_id, {"compaction_threshold": 3})
# hit the exit failpoint
with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
Expand All @@ -72,9 +68,15 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
# L0
continue

candidate = parse_layer_file_name(path.name)

if isinstance(candidate, ImageLayerName):
continue

if l1_found is not None:
raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}")
l1_found = parse_layer_file_name(path.name)
raise RuntimeError(f"found multiple L1: {l1_found.to_str()} and {path.name}")

l1_found = candidate

assert l1_found is not None, "failed to find L1 locally"

Expand All @@ -93,15 +95,10 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
# wait for us to catch up again
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)

pageserver_http.timeline_compact(tenant_id, timeline_id)

# give time for log flush
time.sleep(1)
pageserver_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)

assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears"

wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)

uploaded = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, l1_found.to_str()
)
Expand Down

1 comment on commit 8ee191c

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3280 tests run: 3123 passed, 5 failed, 152 skipped (full report)


Failures on Postgres 15

  • test_storage_controller_smoke: debug

Failures on Postgres 14

  • test_pgbench_intensive_init_workload[neon_on-github-actions-selfhosted-1000]: release
  • test_sharding_autosplit[github-actions-selfhosted]: release
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-sequential-10-13-30]: release
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-vectored-10-13-30]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_pgbench_intensive_init_workload[neon_on-release-pg14-github-actions-selfhosted-1000] or test_sharding_autosplit[release-pg14-github-actions-selfhosted] or test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-sequential-10-13-30] or test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-vectored-10-13-30] or test_storage_controller_smoke[debug-pg15]"
Flaky tests (1)

Postgres 15

  • test_storage_controller_smoke: release

Test coverage report is not available

The comment gets automatically updated with the latest test results
8ee191c at 2024-06-07T08:42:16.029Z :recycle:

Please sign in to comment.