Skip to content

Commit

Permalink
tests: refine test_secondary_background_downloads (#7829)
Browse files Browse the repository at this point in the history
## Problem

This test relied on some sleeps, and was failing ~5% of the time.

## Summary of changes

Use log-watching rather than straight waits, and make timeouts more
generous for the CI environment.
  • Loading branch information
jcsp committed May 22, 2024
1 parent ddd8ebd commit 014f822
Showing 1 changed file with 46 additions and 5 deletions.
51 changes: 46 additions & 5 deletions test_runner/regress/test_pageserver_secondary.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
default_download_period_secs = 60

# The upload period, which will also be the download once the secondary has seen its first heatmap
upload_period_secs = 20
upload_period_secs = 30

for _i in range(0, tenant_count):
tenant_id = TenantId.generate()
Expand All @@ -596,43 +596,84 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):

tenant_timelines[tenant_id] = [timeline_a, timeline_b]

def await_log(pageserver, deadline, expression):
    """
    Wait for `expression` to appear in the pageserver's log, bounded by an
    absolute `deadline` (epoch seconds) rather than a relative timeout.

    Raises RuntimeError immediately if the deadline has already passed;
    otherwise delegates to wait_until and re-raises its failure after
    logging, so CI output shows which expression timed out.
    """
    now = time.time()
    if now > deadline:
        raise RuntimeError(f"Timed out waiting for {expression}")
    # Round up so a sub-second remainder still yields at least a 1s wait.
    timeout = int(deadline - now) + 1
    try:
        wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression))  # type: ignore
    except Exception:
        # Narrowed from a bare `except:` so BaseException subclasses like
        # KeyboardInterrupt/SystemExit are not intercepted here.
        log.error(f"Timed out waiting for '{expression}'")
        raise

t_start = time.time()

    # Wait long enough that the background downloads should happen; we expect all the initial layers
# of all the initial timelines to show up on the secondary location of each tenant.
time.sleep(default_download_period_secs * 1.5)
initial_download_deadline = time.time() + default_download_period_secs * 3

for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
ps_attached = env.get_pageserver(attached_to_id)
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)

now = time.time()
if now > initial_download_deadline:
raise RuntimeError("Timed out waiting for initial secondary download")
else:
for timeline_id in timelines:
log.info(
f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}"
)
await_log(
ps_secondary,
initial_download_deadline,
f".*{timeline_id}.*Wrote timeline_detail.*",
)

for timeline_id in timelines:
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
log.info(
f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}"
)
# One or more layers should be present for all timelines
assert ps_secondary.list_layers(tenant_id, timeline_id)

# Delete the second timeline: this should be reflected later on the secondary
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])

# Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
time.sleep(upload_period_secs * 2.5)
deletion_deadline = time.time() + upload_period_secs * 3

for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
ps_attached = env.get_pageserver(attached_to_id)
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)

expect_del_timeline = timelines[1]
log.info(
f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}"
)
await_log(
ps_secondary,
deletion_deadline,
f".*Timeline no longer in heatmap.*{expect_del_timeline}.*",
)

# This one was not deleted
assert ps_secondary.list_layers(tenant_id, timelines[0])

# This one was deleted
log.info(
f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
)
assert not ps_secondary.list_layers(tenant_id, timelines[1])
assert not ps_secondary.list_layers(tenant_id, expect_del_timeline)

t_end = time.time()

Expand Down

1 comment on commit 014f822

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3196 tests run: 3054 passed, 2 failed, 140 skipped (full report)


Failures on Postgres 15

# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_create_snapshot[release-pg15] or test_forward_compatibility[release-pg15]"
Flaky tests (3)

Postgres 16

  • test_timeline_deletion_with_files_stuck_in_upload_queue: debug

Postgres 14

Test coverage report is not available

The comment gets automatically updated with the latest test results
014f822 at 2024-05-22T19:40:17.382Z :recycle:

Please sign in to comment.