Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pageserver: return ACCEPTED when deletion already in flight #7384

Merged
merged 5 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions pageserver/src/tenant/mgr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1410,9 +1410,15 @@ impl TenantManager {

match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If a tenant is broken or stopping, DeleteTenantFlow can
// handle it: broken tenants proceed to delete, stopping tenants
// are checked for deletion already in progress.
// If deletion is already in progress, return success (the semantics of this
// function are to rerturn success afterr deletion is spawned in background).
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
if tenant.delete_progress.try_lock().is_err() {
// The `delete_progress` lock is held: deletion is already happening
// in the bacckground
slot_guard.revert();
return Ok(());
}
jcsp marked this conversation as resolved.
Show resolved Hide resolved
}
_ => {
tenant
Expand Down
20 changes: 8 additions & 12 deletions test_runner/regress/test_tenant_delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,14 +484,10 @@ def test_tenant_delete_concurrent(
run_pg_bench_small(pg_bin, endpoint.connstr())
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)

CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken"

env.pageserver.allowed_errors.extend(
[
# lucky race with stopping from flushing a layer we fail to schedule any uploads
".*layer flush task.+: could not flush frozen layer: update_metadata_file",
# Errors logged from our 4xx requests
f".*{CONFLICT_MESSAGE}.*",
]
)

Expand All @@ -509,20 +505,20 @@ def delete_tenant():
def hit_remove_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")

def hit_run_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
def hit_run_failpoint(cursor=None):
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=cursor)

with concurrent.futures.ThreadPoolExecutor() as executor:
background_200_req = executor.submit(delete_tenant)
assert background_200_req.result(timeout=10).status_code == 202

# Wait until the first request completes its work and is blocked on removing
# the TenantSlot from tenant manager.
wait_until(100, 0.1, hit_remove_failpoint)
log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
jcsp marked this conversation as resolved.
Show resolved Hide resolved

# Start another request: this should fail when it sees a tenant in Stopping state
with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
ps_http.tenant_delete(tenant_id)
# Start another request: this should succeed without actually entering the deletion code
ps_http.tenant_delete(tenant_id)
assert not hit_run_failpoint(log_cursor)

# Start another background request, which will pause after acquiring a TenantSlotGuard
# but before completing.
Expand All @@ -539,8 +535,8 @@ def hit_run_failpoint():

# Permit the duplicate background request to run to completion and fail.
ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
background_4xx_req.result(timeout=10)
background_4xx_req.result(timeout=10)
jcsp marked this conversation as resolved.
Show resolved Hide resolved
assert not hit_run_failpoint(log_cursor)

# Physical deletion should have happened
assert_prefix_empty(
Expand Down
Loading