Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(pageserver): use leases to temporarily block gc #8084

Merged
merged 8 commits into from
Jun 18, 2024
8 changes: 8 additions & 0 deletions control_plane/src/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,10 @@ impl PageServerNode {
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
.map(|x| x.to_string()),
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
Expand Down Expand Up @@ -506,6 +510,10 @@ impl PageServerNode {
.map(|x| x.parse::<AuxFilePolicy>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
.map(|x| x.to_string()),
}
};

Expand Down
10 changes: 2 additions & 8 deletions libs/pageserver_api/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,20 +179,12 @@ serde_with::serde_conv!(

impl LsnLease {
/// The default length for an explicit LSN lease request (10 minutes).
#[cfg(not(feature = "testing"))]
pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);

#[cfg(feature = "testing")]
pub const DEFAULT_LENGTH: Duration = Duration::from_secs(2);

/// The default length for an implicit LSN lease granted during
/// `get_lsn_by_timestamp` request (1 minutes).
#[cfg(not(feature = "testing"))]
pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);

#[cfg(feature = "testing")]
pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(2);

/// Checks whether the lease is expired.
pub fn is_expired(&self, now: &SystemTime) -> bool {
now > &self.valid_until
Expand Down Expand Up @@ -344,6 +336,8 @@ pub struct TenantConfig {
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_aux_file_policy: Option<AuxFilePolicy>,
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
}

/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
Expand Down
3 changes: 1 addition & 2 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LsnLease;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
Expand Down Expand Up @@ -1731,7 +1730,7 @@ async fn lsn_lease_handler(
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let result = timeline
.make_lsn_lease(lsn, LsnLease::DEFAULT_LENGTH, &ctx)
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;

json_response(StatusCode::OK, result)
Expand Down
3 changes: 1 addition & 2 deletions pageserver/src/page_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ use futures::stream::FuturesUnordered;
use futures::Stream;
use futures::StreamExt;
use pageserver_api::key::Key;
use pageserver_api::models::LsnLease;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
Expand Down Expand Up @@ -936,7 +935,7 @@ impl PageServerHandler {
let timeline = self
.get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
.await?;
let lease = timeline.make_lsn_lease(lsn, LsnLease::DEFAULT_LENGTH, ctx)?;
let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
let valid_until = lease
.valid_until
.duration_since(SystemTime::UNIX_EPOCH)
Expand Down
25 changes: 19 additions & 6 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2429,6 +2429,13 @@ impl Tenant {
}
}

pub fn get_lsn_lease_length(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.lsn_lease_length
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
}

pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
// Use read-copy-update in order to avoid overwriting the location config
// state if this races with [`Tenant::set_new_location_config`]. Note that
Expand Down Expand Up @@ -3835,6 +3842,8 @@ pub(crate) mod harness {
tenant_conf.image_layer_creation_check_threshold,
),
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
}
}
}
Expand Down Expand Up @@ -4052,7 +4061,7 @@ mod tests {
use itertools::Itertools;
use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use rand::{thread_rng, Rng};
use storage_layer::PersistentLayerKey;
use tests::storage_layer::ValuesReconstructState;
Expand Down Expand Up @@ -6973,7 +6982,7 @@ mod tests {
let leased_lsns = [0x30, 0x50, 0x70];
let mut leases = Vec::new();
let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
leases.push(timeline.make_lsn_lease(Lsn(*n), LsnLease::DEFAULT_LENGTH, &ctx)?);
leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
Ok(())
});

Expand All @@ -6983,8 +6992,11 @@ mod tests {
assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);

// Renewing with a long lease should renew lease with later expiration time.
let updated_lease_1 =
timeline.make_lsn_lease(Lsn(leased_lsns[1]), LsnLease::DEFAULT_LENGTH * 2, &ctx)?;
let updated_lease_1 = timeline.make_lsn_lease(
Lsn(leased_lsns[1]),
timeline.get_lsn_lease_length() * 2,
&ctx,
)?;

assert!(updated_lease_1.valid_until > leases[1].valid_until);

Expand Down Expand Up @@ -7017,12 +7029,13 @@ mod tests {
// Make lease on a already GC-ed LSN.
// 0/80 does not have a valid lease + is below latest_gc_cutoff
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
let res = timeline.make_lsn_lease(Lsn(0x80), LsnLease::DEFAULT_LENGTH, &ctx);
let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
assert!(res.is_err());

// Should still be able to renew a currently valid lease
// Assumption: original lease to is still valid for 0/50.
let _ = timeline.make_lsn_lease(Lsn(leased_lsns[1]), LsnLease::DEFAULT_LENGTH, &ctx)?;
let _ =
timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;

Ok(())
}
Expand Down
31 changes: 31 additions & 0 deletions pageserver/src/tenant/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::CompactionAlgorithmSettings;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::LsnLease;
use pageserver_api::models::{self, ThrottleConfig};
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
Expand Down Expand Up @@ -377,6 +378,16 @@ pub struct TenantConf {
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
/// file is written.
pub switch_aux_file_policy: AuxFilePolicy,

/// The length for an explicit LSN lease request.
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
#[serde(with = "humantime_serde")]
pub lsn_lease_length: Duration,

/// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
#[serde(with = "humantime_serde")]
pub lsn_lease_length_for_ts: Duration,
}

/// Same as TenantConf, but this struct preserves the information about
Expand Down Expand Up @@ -476,6 +487,16 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_aux_file_policy: Option<AuxFilePolicy>,

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
pub lsn_lease_length: Option<Duration>,

#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
pub lsn_lease_length_for_ts: Option<Duration>,
}

impl TenantConfOpt {
Expand Down Expand Up @@ -538,6 +559,12 @@ impl TenantConfOpt {
switch_aux_file_policy: self
.switch_aux_file_policy
.unwrap_or(global_conf.switch_aux_file_policy),
lsn_lease_length: self
.lsn_lease_length
.unwrap_or(global_conf.lsn_lease_length),
lsn_lease_length_for_ts: self
.lsn_lease_length_for_ts
.unwrap_or(global_conf.lsn_lease_length_for_ts),
}
}
}
Expand Down Expand Up @@ -582,6 +609,8 @@ impl Default for TenantConf {
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
}
}
}
Expand Down Expand Up @@ -657,6 +686,8 @@ impl From<TenantConfOpt> for models::TenantConfig {
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_aux_file_policy: value.switch_aux_file_policy,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
}
}
}
Expand Down
11 changes: 7 additions & 4 deletions pageserver/src/tenant/tasks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use pageserver_api::models::LsnLease;
use rand::Rng;
use tokio_util::sync::CancellationToken;
use tracing::*;
Expand Down Expand Up @@ -365,7 +364,10 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
if first {
first = false;

if delay_by_default_lease_length(&cancel).await.is_err() {
if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
.await
.is_err()
{
break;
}

Expand Down Expand Up @@ -543,10 +545,11 @@ pub(crate) async fn random_init_delay(
/// We do this as the leases mapping are not persisted to disk. By delaying GC by default
/// length, we gurantees that all the leases we granted before the restart will expire
/// when we run GC for the first time after the restart.
pub(crate) async fn delay_by_default_lease_length(
pub(crate) async fn delay_by_lease_length(
length: Duration,
cancel: &CancellationToken,
) -> Result<(), Cancelled> {
match tokio::time::timeout(LsnLease::DEFAULT_LENGTH, cancel.cancelled()).await {
match tokio::time::timeout(length, cancel.cancelled()).await {
Ok(_) => Err(Cancelled),
Err(_) => Ok(()),
}
Expand Down
18 changes: 18 additions & 0 deletions pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2117,6 +2117,24 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

// Private functions
impl Timeline {
pub(crate) fn get_lsn_lease_length(&self) -> Duration {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.lsn_lease_length
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
}

// TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
#[allow(unused)]
pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.lsn_lease_length
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
yliang412 marked this conversation as resolved.
Show resolved Hide resolved
}

pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
let tenant_conf = self.tenant_conf.load();
tenant_conf
Expand Down
Loading