safekeeper: add eviction_min_resident to stop evictions thrashing (#8335)

## Problem

- The condition for eviction is not time-based: a timeline can be restored in response to a client request, the client can then time out, and as soon as the restore completes the timeline is immediately evicted again.
- There is no delay on eviction at startup of the safekeeper, so when it starts up and sees many idle timelines, it evicts many of them, and those timelines will likely be restored immediately when someone next uses them.

## Summary of changes

- Add `eviction_min_resident` parameter, and use it in
`ready_for_eviction` to avoid evictions if the timeline has been
resident for less than this period.
- This also implicitly delays evictions at startup for `eviction_min_resident`.
- Set this to a very low number for the existing eviction test, which
expects immediate eviction.

The default period is 15 minutes. The reasoning is that in the worst case, where we thrash ~10k timelines on one safekeeper and download 16 MB for each one, the period should be long enough that the resulting traffic does not overwhelm the node's bandwidth.
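
As a rough back-of-the-envelope check using those numbers: 10,000 timelines × 16 MB ≈ 160 GB, which spread over a 15-minute window is roughly 180 MB/s (about 1.4 Gbit/s) of restore traffic even in the pathological case, which should be within the bandwidth a typical node can absorb.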
jcsp committed Jul 10, 2024
1 parent 9f4511c commit 24f8133
Showing 7 changed files with 56 additions and 14 deletions.
11 changes: 9 additions & 2 deletions safekeeper/src/bin/safekeeper.rs
@@ -27,8 +27,8 @@ use utils::pid_file;

 use metrics::set_build_info_metric;
 use safekeeper::defaults::{
-    DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR,
-    DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY,
+    DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT,
+    DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY,
     DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
 use safekeeper::http;
@@ -194,6 +194,12 @@ struct Args {
     /// Number of allowed concurrent uploads of partial segments to remote storage.
     #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)]
     partial_backup_concurrency: usize,
+    /// How long a timeline must be resident before it is eligible for eviction.
+    /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction,
+    /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again,
+    /// if it weren't for `eviction_min_resident` preventing that.
+    #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
+    eviction_min_resident: Duration,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -348,6 +354,7 @@ async fn main() -> anyhow::Result<()> {
         delete_offloaded_wal: args.delete_offloaded_wal,
         control_file_save_interval: args.control_file_save_interval,
         partial_backup_concurrency: args.partial_backup_concurrency,
+        eviction_min_resident: args.eviction_min_resident,
     };

     // initialize sentry if SENTRY_DSN is provided
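
For reference, the new flag accepts humantime-style durations like the other interval options above; a hypothetical invocation (with all other required flags omitted) would look like `safekeeper --eviction-min-resident=30m`, while the regression test below passes `--eviction-min-resident=100ms` to override the 15-minute default.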
7 changes: 7 additions & 0 deletions safekeeper/src/lib.rs
@@ -53,6 +53,11 @@ pub mod defaults {
     pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
     pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
     pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
+
+    // By default, our required residency before eviction is the same as the period that passes
+    // before uploading a partial segment, so that in normal operation the eviction can happen
+    // as soon as we have done the partial segment upload.
+    pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT;
 }

 #[derive(Debug, Clone)]
@@ -93,6 +98,7 @@ pub struct SafeKeeperConf {
     pub delete_offloaded_wal: bool,
     pub control_file_save_interval: Duration,
     pub partial_backup_concurrency: usize,
+    pub eviction_min_resident: Duration,
 }

 impl SafeKeeperConf {
@@ -136,6 +142,7 @@ impl SafeKeeperConf {
             delete_offloaded_wal: false,
             control_file_save_interval: Duration::from_secs(1),
             partial_backup_concurrency: 1,
+            eviction_min_resident: Duration::ZERO,
         }
     }
 }
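
A minimal standalone sketch (not code from this commit) of how these string defaults become `Duration` values, assuming the `humantime` crate referenced by the CLI definition above is available as a dependency:

```rust
use std::time::Duration;

fn main() {
    // Defaults are stored as strings ("15m", "300s", ...) and parsed with the same
    // humantime parser that the --eviction-min-resident flag uses.
    let eviction_min_resident: Duration =
        humantime::parse_duration("15m").expect("default must parse");
    assert_eq!(eviction_min_resident, Duration::from_secs(15 * 60));
}
```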
4 changes: 4 additions & 0 deletions safekeeper/src/timeline_eviction.rs
@@ -5,6 +5,7 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use remote_storage::RemotePath;
+use std::time::Instant;
 use tokio::{
     fs::File,
     io::{AsyncRead, AsyncWriteExt},
@@ -48,6 +49,7 @@ impl Manager {
                 .flush_lsn
                 .segment_number(self.wal_seg_size)
                 == self.last_removed_segno + 1
+            && self.resident_since.elapsed() >= self.conf.eviction_min_resident
     }

     /// Evict the timeline to remote storage.
@@ -91,6 +93,8 @@ impl Manager {
             return;
         }

+        self.resident_since = Instant::now();
+
         info!("successfully restored evicted timeline");
     }
 }
5 changes: 5 additions & 0 deletions safekeeper/src/timeline_manager.rs
@@ -186,6 +186,10 @@ pub(crate) struct Manager {
     // misc
     pub(crate) access_service: AccessService,
     pub(crate) partial_backup_rate_limiter: RateLimiter,
+
+    // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
+    // evict them if they go inactive very soon after being restored.
+    pub(crate) resident_since: std::time::Instant,
 }

 /// This task gets spawned alongside each timeline and is responsible for managing the timeline's
@@ -350,6 +354,7 @@ impl Manager {
             access_service: AccessService::new(manager_tx),
             tli,
             partial_backup_rate_limiter,
+            resident_since: std::time::Instant::now(),
         }
     }

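
To see the anti-flapping mechanism from the last two files in isolation, here is a minimal, self-contained sketch. The `ResidencyGate` type is invented for illustration; in the actual code the timestamp lives on `Manager` as `resident_since`, the comparison is one of several conditions in `ready_for_eviction`, and the reset happens after a successful un-eviction:

```rust
use std::time::{Duration, Instant};

/// Illustrative stand-in for the residency tracking added to `Manager`.
struct ResidencyGate {
    resident_since: Instant,
    eviction_min_resident: Duration,
}

impl ResidencyGate {
    fn new(eviction_min_resident: Duration) -> Self {
        // Starting the clock at construction is what implicitly delays
        // evictions for `eviction_min_resident` after safekeeper startup.
        Self {
            resident_since: Instant::now(),
            eviction_min_resident,
        }
    }

    /// Restart the residency clock when the timeline is restored (un-evicted).
    fn on_restore(&mut self) {
        self.resident_since = Instant::now();
    }

    /// The time-based part of the eviction check: the timeline must have been
    /// resident for at least `eviction_min_resident`.
    fn ready_for_eviction(&self) -> bool {
        self.resident_since.elapsed() >= self.eviction_min_resident
    }
}

fn main() {
    let mut gate = ResidencyGate::new(Duration::from_millis(50));
    assert!(!gate.ready_for_eviction()); // freshly resident: not evictable yet

    std::thread::sleep(Duration::from_millis(60));
    assert!(gate.ready_for_eviction()); // residency requirement met

    gate.on_restore(); // an un-eviction resets the clock
    assert!(!gate.ready_for_eviction());
}
```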
1 change: 1 addition & 0 deletions safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -188,6 +188,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         delete_offloaded_wal: false,
         control_file_save_interval: Duration::from_secs(1),
         partial_backup_concurrency: 1,
+        eviction_min_resident: Duration::ZERO,
     };

     let mut global = GlobalMap::new(disk, conf.clone())?;
21 changes: 19 additions & 2 deletions test_runner/fixtures/neon_fixtures.py
@@ -492,6 +492,7 @@ def __init__(
         pageserver_virtual_file_io_engine: Optional[str] = None,
         pageserver_aux_file_policy: Optional[AuxFileStore] = None,
         pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
+        safekeeper_extra_opts: Optional[list[str]] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -557,6 +558,8 @@ def __init__(

         self.pageserver_aux_file_policy = pageserver_aux_file_policy

+        self.safekeeper_extra_opts = safekeeper_extra_opts
+
         assert test_name.startswith(
             "test_"
         ), "Unexpectedly instantiated from outside a test function"
@@ -1193,7 +1196,9 @@ def __init__(self, config: NeonEnvBuilder):
                 sk_cfg[
                     "remote_storage"
                 ] = self.safekeepers_remote_storage.to_toml_inline_table().strip()
-            self.safekeepers.append(Safekeeper(env=self, id=id, port=port))
+            self.safekeepers.append(
+                Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts)
+            )
             cfg["safekeepers"].append(sk_cfg)

         log.info(f"Config: {cfg}")
@@ -4016,16 +4021,28 @@ class Safekeeper(LogUtils):
     id: int
     running: bool = False

-    def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False):
+    def __init__(
+        self,
+        env: NeonEnv,
+        port: SafekeeperPort,
+        id: int,
+        running: bool = False,
+        extra_opts: Optional[List[str]] = None,
+    ):
         self.env = env
         self.port = port
         self.id = id
         self.running = running
         self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
+        self.extra_opts = extra_opts

     def start(
         self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
     ) -> "Safekeeper":
+        if extra_opts is None:
+            # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two.
+            extra_opts = self.extra_opts
+
         assert self.running is False
         self.env.neon_cli.safekeeper_start(
             self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
21 changes: 11 additions & 10 deletions test_runner/regress/test_wal_acceptor.py
@@ -2191,24 +2191,25 @@ def test_s3_eviction(
 ):
     neon_env_builder.num_safekeepers = 3
     neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            "checkpoint_timeout": "100ms",
-        }
-    )

-    extra_opts = [
+    neon_env_builder.safekeeper_extra_opts = [
         "--enable-offload",
         "--partial-backup-timeout",
         "50ms",
         "--control-file-save-interval",
         "1s",
+        # Safekeepers usually wait a while before evicting something: for this test we want them to
+        # evict things as soon as they are inactive.
+        "--eviction-min-resident=100ms",
     ]
     if delete_offloaded_wal:
-        extra_opts.append("--delete-offloaded-wal")
+        neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal")

-    for sk in env.safekeepers:
-        sk.stop().start(extra_opts=extra_opts)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_timeout": "100ms",
+        }
+    )

     n_timelines = 5

@@ -2263,7 +2264,7 @@ def test_s3_eviction(
     # restarting random safekeepers
     for sk in env.safekeepers:
         if random.random() < restart_chance:
-            sk.stop().start(extra_opts=extra_opts)
+            sk.stop().start()
         time.sleep(0.5)

     # require at least one successful eviction in at least one safekeeper

1 comment on commit 24f8133

github-actions bot commented on 24f8133 (Jul 10, 2024)


3154 tests run: 3027 passed, 0 failed, 127 skipped (full report)


Flaky tests (2)

Postgres 14

  • test_compute_pageserver_connection_stress: release
  • test_basebackup_with_high_slru_count[github-actions-selfhosted-sequential-10-13-30]: release

Code coverage* (full report)

  • functions: 32.7% (6972 of 21335 functions)
  • lines: 50.1% (54801 of 109455 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
24f8133 at 2024-07-11T07:21:07.686Z
