# pageserver: add GcError type (#7917)
## Problem

- Because GC exposes all errors as an anyhow::Error, callers cannot tell
cancellation apart from real failures, giving us intermittent spurious
error logs during shutdown, e.g. in this failure of a performance test:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/9300804302/index.html#suites/07874de07c4a1c9effe0d92da7755ebf/214a2154f6f0217a/

```
Gc failed 1 times, retrying in 2s: shutting down
```

GC doesn't do much complicated IO: it gains little from the backtrace
capabilities of anyhow::Error, and its failure modes can be expressed more
robustly as an enum.
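
As a minimal self-contained sketch of that pattern (illustrative only — the
variant set here is simplified, not the exact type added by this commit):

```
use thiserror::Error;

// A closed enum gives callers a cancellation variant they can match on,
// instead of an opaque anyhow::Error they can only log.
#[derive(Error, Debug)]
enum GcError {
    #[error("tenant shutting down")]
    TenantCancelled,
    #[error("bad lsn: {why}")]
    BadLsn { why: String },
    // Escape hatch for genuinely unexpected errors, e.g. from remote storage.
    #[error(transparent)]
    Remote(#[from] anyhow::Error),
}

fn report(res: Result<(), GcError>) {
    match res {
        // Shutdown is expected: stay quiet instead of emitting
        // "Gc failed N times, retrying ..." into the logs.
        Err(GcError::TenantCancelled) => {}
        Err(e) => eprintln!("gc failed: {e}"),
        Ok(()) => {}
    }
}

fn main() {
    report(Err(GcError::TenantCancelled)); // silent: expected during shutdown
    report(Err(GcError::BadLsn { why: "cutoff moved backwards".into() })); // logged
}
```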

## Summary of changes

- Add GcError type and use it instead of anyhow::Error in GC functions
- In `gc_iteration_internal`, return a `GcError` cancellation variant on
shutdown rather than Ok (we only used Ok before because we didn't have a
clear cancellation error variant to use).
- In `gc_iteration_internal`, skip past timelines that are shutting
down, to avoid having to go through another GC iteration if we happen to
see a deleting timeline during a GC run.
- In `refresh_gc_info_internal`, avoid an error case where a timeline could
disappear between being listed and being looked up again, by carrying an
Arc<Timeline> instead of a TimelineId between the first loop and second loop
in the function (see the sketch after this list).
- In the HTTP request handler, map the Cancelled variants to 503 instead of
turning all GC errors into 500s.
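
Carrying Arc<Timeline> between the two loops removes a time-of-check/time-of-use
hazard: the Arc keeps each timeline alive, so the second pass never needs a
fallible lookup by ID. A minimal sketch of the idea (types and function names
here are illustrative, not the pageserver API):

```
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

struct Timeline {
    id: u64,
}

// First pass: collect Arc clones while briefly holding the lock.
fn collect(timelines: &Mutex<HashMap<u64, Arc<Timeline>>>) -> Vec<Arc<Timeline>> {
    timelines.lock().unwrap().values().cloned().collect()
}

// Second pass: operate on the Arcs directly. Each `tl` stays alive even if
// its map entry was removed in between, so the "timeline not found after
// being looked up" error case disappears.
fn second_pass(selected: Vec<Arc<Timeline>>) {
    for tl in selected {
        let _ = tl.id;
    }
}

fn main() {
    let map = Mutex::new(HashMap::from([(1, Arc::new(Timeline { id: 1 }))]));
    let selected = collect(&map);
    map.lock().unwrap().clear(); // simulate a concurrent timeline deletion
    second_pass(selected);
}
```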
jcsp committed May 31, 2024
1 parent ef83f31 commit 7e60563
Showing 5 changed files with 129 additions and 77 deletions.
112 changes: 65 additions & 47 deletions pageserver/src/tenant.rs
@@ -487,6 +487,33 @@ enum CreateTimelineCause {
Delete,
}

#[derive(thiserror::Error, Debug)]
pub(crate) enum GcError {
// The tenant is shutting down
#[error("tenant shutting down")]
TenantCancelled,

// The timeline is shutting down
#[error("timeline shutting down")]
TimelineCancelled,

// The tenant is in a state ineligible to run GC
#[error("not active")]
NotActive,

// A requested GC cutoff LSN was invalid, for example it tried to move backwards
#[error("not active")]
BadLsn { why: String },

// A remote storage error while scheduling updates after GC
#[error(transparent)]
Remote(anyhow::Error),

// If GC was invoked for a particular timeline, this error means it didn't exist
#[error("timeline not found")]
TimelineNotFound,
}

impl Tenant {
/// Yet another helper for timeline initialization.
///
@@ -1605,24 +1632,23 @@ impl Tenant {
/// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever
/// requires more history to be retained.
//
pub async fn gc_iteration(
pub(crate) async fn gc_iteration(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
) -> Result<GcResult, GcError> {
// Don't start doing work during shutdown
if let TenantState::Stopping { .. } = self.current_state() {
return Ok(GcResult::default());
}

// there is a global allowed_error for this
anyhow::ensure!(
self.is_active(),
"Cannot run GC iteration on inactive tenant"
);
if !self.is_active() {
return Err(GcError::NotActive);
}

{
let conf = self.tenant_conf.load();
@@ -2790,28 +2816,13 @@ impl Tenant {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
) -> Result<GcResult, GcError> {
let mut totals: GcResult = Default::default();
let now = Instant::now();

let gc_timelines = match self
let gc_timelines = self
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
{
Ok(result) => result,
Err(e) => {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
// Handle cancellation
totals.elapsed = now.elapsed();
return Ok(totals);
} else {
// Propagate other errors
return Err(e);
}
}
};
.await?;

failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");

@@ -2836,7 +2847,19 @@
// made.
break;
}
let result = timeline.gc().await?;
let result = match timeline.gc().await {
Err(GcError::TimelineCancelled) => {
if target_timeline_id.is_some() {
// If we were targeting this specific timeline, surface cancellation to the caller
return Err(GcError::TimelineCancelled);
} else {
// A timeline may be shutting down independently of the tenant's lifecycle: we should
// skip past this and proceed to try GC on other timelines.
continue;
}
}
r => r?,
};
totals += result;
}

@@ -2849,11 +2872,11 @@
/// [`Tenant::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
pub async fn refresh_gc_info(
pub(crate) async fn refresh_gc_info(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
) -> Result<Vec<Arc<Timeline>>, GcError> {
// since this method can now be called at different rates than the configured gc loop, it
// might be that these configuration values get applied faster than what it was previously,
// since these were only read from the gc task.
@@ -2874,7 +2897,7 @@
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
) -> Result<Vec<Arc<Timeline>>, GcError> {
// before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
// currently visible timelines.
let timelines = self
@@ -2911,8 +2934,8 @@
}
}

if !self.is_active() {
anyhow::bail!("shutting down");
if !self.is_active() || self.cancel.is_cancelled() {
return Err(GcError::TenantCancelled);
}

// grab mutex to prevent new timelines from being created here; avoid doing long operations
@@ -2921,19 +2944,19 @@

// Scan all timelines. For each timeline, remember the timeline ID and
// the branch point where it was created.
let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = {
let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = {
let timelines = self.timelines.lock().unwrap();
let mut all_branchpoints = BTreeSet::new();
let timeline_ids = {
let timelines = {
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
if timelines.get(target_timeline_id).is_none() {
bail!("gc target timeline does not exist")
return Err(GcError::TimelineNotFound);
}
};

timelines
.iter()
.map(|(timeline_id, timeline_entry)| {
.map(|(_timeline_id, timeline_entry)| {
if let Some(ancestor_timeline_id) =
&timeline_entry.get_ancestor_timeline_id()
{
@@ -2955,41 +2978,36 @@
}
}

*timeline_id
timeline_entry.clone()
})
.collect::<Vec<_>>()
};
(all_branchpoints, timeline_ids)
(all_branchpoints, timelines)
};

// Ok, we now know all the branch points.
// Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timeline_ids.len());
for timeline_id in timeline_ids {
// Timeline is known to be local and loaded.
let timeline = self
.get_timeline(timeline_id, false)
.with_context(|| format!("Timeline {timeline_id} was not found"))?;

let mut gc_timelines = Vec::with_capacity(timelines.len());
for timeline in timelines {
// If target_timeline is specified, ignore all other timelines
if let Some(target_timeline_id) = target_timeline_id {
if timeline_id != target_timeline_id {
if timeline.timeline_id != target_timeline_id {
continue;
}
}

let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
Included((timeline.timeline_id, Lsn(0))),
Included((timeline.timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();

{
let mut target = timeline.gc_info.write().unwrap();

match gc_cutoffs.remove(&timeline_id) {
match gc_cutoffs.remove(&timeline.timeline_id) {
Some(cutoffs) => {
*target = GcInfo {
retain_lsns: branchpoints,
10 changes: 8 additions & 2 deletions pageserver/src/tenant/mgr.rs
@@ -45,7 +45,7 @@ use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

use utils::crashsafe::path_with_suffix_extension;
@@ -2833,7 +2833,13 @@ pub(crate) async fn immediate_gc(
}
}

result.map_err(ApiError::InternalServerError)
result.map_err(|e| match e {
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
GcError::TimelineNotFound => {
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
}
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})
}

#[cfg(test)]
33 changes: 20 additions & 13 deletions pageserver/src/tenant/tasks.rs
@@ -380,21 +380,28 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let res = tenant
.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
.await;
if let Err(e) = res {
    let wait_duration = backoff::exponential_backoff_duration_seconds(
        error_run_count + 1,
        1.0,
        MAX_BACKOFF_SECS,
    );
    error_run_count += 1;
    let wait_duration = Duration::from_secs_f64(wait_duration);
    error!(
        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
    );
    wait_duration
} else {
    error_run_count = 0;
    period
};
match res {
    Ok(_) => {
        error_run_count = 0;
        period
    }
    Err(crate::tenant::GcError::TenantCancelled) => {
        return;
    }
    Err(e) => {
        let wait_duration = backoff::exponential_backoff_duration_seconds(
            error_run_count + 1,
            1.0,
            MAX_BACKOFF_SECS,
        );
        error_run_count += 1;
        let wait_duration = Duration::from_secs_f64(wait_duration);

        error!(
            "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
        );
        wait_duration
    }
};

47 changes: 34 additions & 13 deletions pageserver/src/tenant/timeline.rs
@@ -131,11 +131,14 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};

use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
use super::{
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
GcError,
};

#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub(crate) enum FlushLoopState {
@@ -4837,7 +4840,7 @@ impl Timeline {
/// Currently, we don't make any attempt at removing unneeded page versions
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
pub(super) async fn gc(&self) -> Result<GcResult, GcError> {
// this is most likely the background tasks, but it might be the spawned task from
// immediate_gc
let _g = tokio::select! {
@@ -4850,7 +4853,7 @@

// Is the timeline being deleted?
if self.is_stopping() {
anyhow::bail!("timeline is Stopping");
return Err(GcError::TimelineCancelled);
}

let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
@@ -4908,7 +4911,7 @@
pitr_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
) -> anyhow::Result<GcResult> {
) -> Result<GcResult, GcError> {
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc

let now = SystemTime::now();
@@ -4930,12 +4933,15 @@
// The GC cutoff should only ever move forwards.
let waitlist = {
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
ensure!(
*write_guard <= new_gc_cutoff,
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard,
new_gc_cutoff
);
if *write_guard > new_gc_cutoff {
return Err(GcError::BadLsn {
why: format!(
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard, new_gc_cutoff
),
});
}

write_guard.store_and_unlock(new_gc_cutoff)
};
waitlist.wait().await;
@@ -5044,7 +5050,14 @@ impl Timeline {
// This unconditionally schedules also an index_part.json update, even though, we will
// be doing one a bit later with the unlinked gc'd layers.
let disk_consistent_lsn = self.disk_consistent_lsn.load();
self.schedule_uploads(disk_consistent_lsn, None)?;
self.schedule_uploads(disk_consistent_lsn, None)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;

let gc_layers = layers_to_remove
.iter()
@@ -5053,7 +5066,15 @@

result.layers_removed = gc_layers.len() as u64;

self.remote_client.schedule_gc_update(&gc_layers)?;
self.remote_client
.schedule_gc_update(&gc_layers)
.map_err(|e| {
if self.cancel.is_cancelled() {
GcError::TimelineCancelled
} else {
GcError::Remote(e)
}
})?;

guard.finish_gc_timeline(&gc_layers);

@@ -5068,7 +5089,7 @@
result.layers_removed, new_gc_cutoff
);

result.elapsed = now.elapsed()?;
result.elapsed = now.elapsed().unwrap_or(Duration::ZERO);
Ok(result)
}


1 comment on commit 7e60563

@github-actions


3244 tests run: 3089 passed, 3 failed, 152 skipped (full report)


Failures on Postgres 14

  • test_sharding_autosplit[github-actions-selfhosted]: release
  • test_total_size_limit: release
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-vectored-10-13-30]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharding_autosplit[release-pg14-github-actions-selfhosted] or test_total_size_limit[release-pg14] or test_basebackup_with_high_slru_count[release-pg14-github-actions-selfhosted-vectored-10-13-30]"
Flaky tests (4)

Postgres 15

  • test_pageserver_restarts_under_worload: release

Postgres 14

  • test_download_remote_layers_api: release
  • test_secondary_background_downloads: debug
  • test_vm_bit_clear_on_heap_lock: debug

Test coverage report is not available

The comment gets automatically updated with the latest test results
7e60563 at 2024-05-31T22:41:50.787Z :recycle:
