Skip to content

Commit

Permalink
pageserver: avoid spurious "bad state" logs/errors during shutdown (#…
Browse files Browse the repository at this point in the history
…7912)

## Problem

- Initial size calculations tend to fail with `Bad state (not active)`

Closes: #7911

## Summary of changes

- In `wait_lsn`, return WaitLsnError::Cancelled rather than BadState
when the state is Stopping
- Replace PageReconstructError's `Other` variant with a specific
`BadState` variant
- Avoid returning anyhow::Error from get_ready_ancestor_timeline -- this
was only used for the case where there was no ancestor. All callers of
this function had implicitly checked that the ancestor timeline exists
before calling it, so they can pass in the ancestor instead of handling
an error.
  • Loading branch information
jcsp committed May 31, 2024
1 parent 7ec70b5 commit 5a394fd
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 51 deletions.
4 changes: 2 additions & 2 deletions pageserver/src/page_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ impl From<WaitLsnError> for PageStreamError {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()),
}
}
}
Expand All @@ -383,7 +383,7 @@ impl From<WaitLsnError> for QueryError {
match value {
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect,
WaitLsnError::BadState { .. } => Self::Reconnect,
}
}
}
Expand Down
9 changes: 5 additions & 4 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1507,7 +1507,7 @@ impl Tenant {
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
}
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
Expand Down Expand Up @@ -4308,9 +4308,10 @@ mod tests {

// This needs to traverse to the parent, and fails.
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
assert!(err
.to_string()
.contains("will not become active. Current state: Broken"));
assert!(err.to_string().starts_with(&format!(
"Bad state on timeline {}: Broken",
tline.timeline_id
)));

Ok(())
}
Expand Down
89 changes: 44 additions & 45 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ pub(crate) enum PageReconstructError {
Other(#[from] anyhow::Error),

#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
AncestorLsnTimeout(WaitLsnError),

#[error("timeline shutting down")]
Cancelled,
Expand Down Expand Up @@ -651,11 +651,14 @@ pub(crate) enum GetReadyAncestorError {
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),

#[error("Bad state on timeline {timeline_id}: {state:?}")]
BadState {
timeline_id: TimelineId,
state: TimelineState,
},

#[error("Cancelled")]
Cancelled,

#[error(transparent)]
Other(#[from] anyhow::Error),
}

#[derive(Clone, Copy)]
Expand Down Expand Up @@ -690,8 +693,8 @@ pub(crate) enum WaitLsnError {
Shutdown,

// Called on an timeline not in active state or shutting down
#[error("Bad state (not active)")]
BadState,
#[error("Bad timeline state: {0:?}")]
BadState(TimelineState),

// Timeout expired while waiting for LSN to catch up with goal.
#[error("{0}")]
Expand Down Expand Up @@ -756,8 +759,8 @@ impl From<GetReadyAncestorError> for PageReconstructError {
match e {
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
bad_state @ BadState { .. } => PageReconstructError::Other(anyhow::anyhow!(bad_state)),
Cancelled => PageReconstructError::Cancelled,
Other(other) => PageReconstructError::Other(other),
}
}
}
Expand Down Expand Up @@ -1466,10 +1469,11 @@ impl Timeline {
who_is_waiting: WaitLsnWaiter<'_>,
ctx: &RequestContext, /* Prepare for use by cancellation */
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
let state = self.current_state();
if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) {
return Err(WaitLsnError::Shutdown);
} else if !self.is_active() {
return Err(WaitLsnError::BadState);
} else if !matches!(state, TimelineState::Active) {
return Err(WaitLsnError::BadState(state));
}

if cfg!(debug_assertions) {
Expand Down Expand Up @@ -3193,17 +3197,21 @@ impl Timeline {
}

// Recurse into ancestor if needed
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
cont_lsn
);

timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
timeline_owned = timeline
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.await?;
timeline = &*timeline_owned;
prev_lsn = None;
continue 'outer;
}
}

let guard = timeline.layers.read().await;
Expand Down Expand Up @@ -3352,10 +3360,10 @@ impl Timeline {
break None;
}

// Not fully retrieved but no ancestor timeline.
if timeline.ancestor_timeline.is_none() {
let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
// Not fully retrieved but no ancestor timeline.
break Some(keyspace);
}
};

// Now we see if there are keys covered by the image layer but does not exist in the
// image layer, which means that the key does not exist.
Expand All @@ -3375,7 +3383,7 @@ impl Timeline {
// Take the min to avoid reconstructing a page with data newer than request Lsn.
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
timeline_owned = timeline
.get_ready_ancestor_timeline(ctx)
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
.await
.map_err(GetVectoredError::GetReadyAncestorError)?;
timeline = &*timeline_owned;
Expand Down Expand Up @@ -3547,13 +3555,9 @@ impl Timeline {

async fn get_ready_ancestor_timeline(
&self,
ancestor: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
let ancestor = match self.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(GetReadyAncestorError::from(e)),
};

// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
Expand Down Expand Up @@ -3586,11 +3590,10 @@ impl Timeline {
));
}
Err(state) => {
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
return Err(GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
});
}
}
ancestor
Expand All @@ -3599,21 +3602,17 @@ impl Timeline {
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
WaitLsnError::BadState(state) => GetReadyAncestorError::BadState {
timeline_id: ancestor.timeline_id,
state,
},
})?;

Ok(ancestor)
Ok(ancestor.clone())
}

pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
self.timeline_id,
self.get_ancestor_timeline_id(),
)
})?;
Ok(Arc::clone(ancestor))
pub(crate) fn get_ancestor_timeline(&self) -> Option<Arc<Timeline>> {
self.ancestor_timeline.clone()
}

pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
Expand Down

1 comment on commit 5a394fd

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3244 tests run: 3088 passed, 4 failed, 152 skipped (full report)


Failures on Postgres 14

  • test_heavy_write_workload[neon_on-github-actions-selfhosted-10-5-5]: release
  • test_sharding_autosplit[github-actions-selfhosted]: release
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-sequential-10-13-30]: release
  • test_download_churn[github-actions-selfhosted-100-std-fs-30]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_heavy_write_workload[neon_on-release-pg14-github-actions-selfhosted-10-5-5] or test_sharding_autosplit[release-pg14-github-actions-selfhosted] or test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-sequential-10-13-30] or test_download_churn[release-pg14-github-actions-selfhosted-100-std-fs-30]"

Code coverage* (full report)

  • functions: 31.4% (6491 of 20678 functions)
  • lines: 48.4% (50220 of 103796 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
5a394fd at 2024-05-31T13:49:45.890Z :recycle:

Please sign in to comment.