-
Notifications
You must be signed in to change notification settings - Fork 434
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tokio-epoll-uring: retry on launch failures due to locked memory
refs #7136 Problem ------- Before this PR, we were using `tokio_epoll_uring::thread_local_system()`, which panics on tokio_epoll_uring::System::launch() failure. As we've learned in [the past](#6373 (comment)), some older Linux kernels account io_uring instances as locked memory. And while we've raised the limit in prod considerably, we did hit it once on 2024-03-11 16:30 UTC. That was after we enabled tokio-epoll-uring fleet-wide, but before we had shipped release-5090 (c6ed86d) which did away with the last mass-creation of tokio-epoll-uring instances as per commit 3da410c Author: Christian Schwarz <christian@neon.tech> Date: Tue Mar 5 10:03:54 2024 +0100 tokio-epoll-uring: use it on the layer-creating code paths (#6378) Nonetheless, it highlighted that panicking in this situation is probably not ideal, as it can leave the pageserver process in a semi-broken state. Further, due to low sampling rate of Prometheus metrics, we don't know much about the circumstances of this failure instance. Solution -------- This PR implements a custom thread_local_system() that is pageserver-aware and will do the following on failure: - dump relevant stats to `tracing!`, hopefully they will be useful to understand the circumstances better - if it's neither the locked memory failure nor any other ENOMEM: abort() the process - if it's ENOMEM, retry with exponential back-off, capped at 3s. - add metric counters so we can create an alert This makes sense in the production environment where we know that _usually_, there's ample locked memory allowance available, and we know the failure rate is low.
- Loading branch information
Showing
7 changed files
with
234 additions
and
12 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
195 changes: 195 additions & 0 deletions
195
pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific | ||
//! handling in case the instance can't be launched. | ||
//! | ||
//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation | ||
//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. | ||
//! See https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 for more details. | ||
|
||
use std::sync::atomic::AtomicU32; | ||
use std::sync::Arc; | ||
|
||
use tokio_util::sync::CancellationToken; | ||
use tracing::{error, info, info_span, warn, Instrument}; | ||
use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; | ||
|
||
use tokio_epoll_uring::{System, SystemHandle}; | ||
|
||
use crate::virtual_file::on_fatal_io_error; | ||
|
||
use crate::metrics::tokio_epoll_uring as metrics; | ||
|
||
#[derive(Clone)] | ||
struct ThreadLocalState(Arc<ThreadLocalStateInner>); | ||
|
||
struct ThreadLocalStateInner { | ||
cell: tokio::sync::OnceCell<SystemHandle>, | ||
launch_attempts: AtomicU32, | ||
} | ||
|
||
impl ThreadLocalState { | ||
pub fn new() -> Self { | ||
Self(Arc::new(ThreadLocalStateInner { | ||
cell: tokio::sync::OnceCell::default(), | ||
launch_attempts: AtomicU32::new(0), | ||
})) | ||
} | ||
pub fn make_id_string(&self) -> String { | ||
format!("0x{:p}", Arc::as_ptr(&self.0)) | ||
} | ||
} | ||
|
||
impl Drop for ThreadLocalState { | ||
fn drop(&mut self) { | ||
info!(parent: None, id=%self.make_id_string(), "tokio-epoll-uring_ext: ThreadLocalState is being dropped and id might be re-used in the future"); | ||
} | ||
} | ||
|
||
thread_local! { | ||
static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); | ||
} | ||
|
||
/// Panics if we cannot [`System::launch`]. | ||
pub async fn thread_local_system() -> Handle { | ||
let fake_cancel = CancellationToken::new(); | ||
loop { | ||
let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone()); | ||
let inner = &thread_local_state.0; | ||
// NB: thread_id becomes stale after first await | ||
let get_or_init_res = inner | ||
.cell | ||
.get_or_try_init(|| async { | ||
let attempt_no = inner | ||
.launch_attempts | ||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed); | ||
let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no); | ||
async { | ||
// Rate-limit retries per thread-local. | ||
// NB: doesn't yield to executor at attempt_no=0. | ||
utils::backoff::exponential_backoff( | ||
attempt_no, | ||
DEFAULT_BASE_BACKOFF_SECONDS, | ||
DEFAULT_MAX_BACKOFF_SECONDS, | ||
&fake_cancel, | ||
) | ||
.await; | ||
let res = System::launch() | ||
// this might move us to another executor thread => loop outside the get_or_try_init, not inside it | ||
.await; | ||
match res { | ||
Ok(system) => { | ||
info!("successfully launched system"); | ||
metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc(); | ||
Ok(system) | ||
} | ||
Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { | ||
warn!("not enough locked memory to tokio-epoll-uring, will retry"); | ||
info_span!("stats").in_scope(|| { | ||
emit_launch_failure_process_stats(); | ||
}); | ||
metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); | ||
Err(()) | ||
} | ||
// abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. | ||
// This is equivalent to a fatal IO error. | ||
Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => { | ||
error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process"); | ||
info_span!("stats").in_scope(|| { | ||
emit_launch_failure_process_stats(); | ||
}); | ||
on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring"); | ||
}, | ||
} | ||
} | ||
.instrument(span) | ||
.await | ||
}) | ||
.await; | ||
if get_or_init_res.is_ok() { | ||
return Handle(thread_local_state); | ||
} | ||
} | ||
} | ||
|
||
fn emit_launch_failure_process_stats() { | ||
// tokio-epoll-uring stats | ||
// vmlck + rlimit | ||
// number of threads | ||
// rss / system memory usage generally | ||
|
||
let tokio_epoll_uring::metrics::Metrics { | ||
systems_created, | ||
systems_destroyed, | ||
} = tokio_epoll_uring::metrics::global(); | ||
info!(systems_created, systems_destroyed, "tokio-epoll-uring"); | ||
|
||
match procfs::process::Process::myself() { | ||
Ok(myself) => { | ||
match myself.limits() { | ||
Ok(limits) => { | ||
info!(?limits.max_locked_memory, "/proc/self/limits"); | ||
} | ||
Err(error) => { | ||
info!(%error, "no limit stats due to error"); | ||
} | ||
} | ||
|
||
match myself.status() { | ||
Ok(status) => { | ||
let procfs::process::Status { | ||
vmsize, | ||
vmlck, | ||
vmpin, | ||
vmrss, | ||
rssanon, | ||
rssfile, | ||
rssshmem, | ||
vmdata, | ||
vmstk, | ||
vmexe, | ||
vmlib, | ||
vmpte, | ||
threads, | ||
.. | ||
} = status; | ||
info!( | ||
vmsize, | ||
vmlck, | ||
vmpin, | ||
vmrss, | ||
rssanon, | ||
rssfile, | ||
rssshmem, | ||
vmdata, | ||
vmstk, | ||
vmexe, | ||
vmlib, | ||
vmpte, | ||
threads, | ||
"/proc/self/status" | ||
); | ||
} | ||
Err(error) => { | ||
info!(%error, "no status status due to error"); | ||
} | ||
} | ||
} | ||
Err(error) => { | ||
info!(%error, "no process stats due to error"); | ||
} | ||
}; | ||
} | ||
|
||
#[derive(Clone)] | ||
pub struct Handle(ThreadLocalState); | ||
|
||
impl std::ops::Deref for Handle { | ||
type Target = SystemHandle; | ||
|
||
fn deref(&self) -> &Self::Target { | ||
self.0 | ||
.0 | ||
.cell | ||
.get() | ||
.expect("must be already initialized when using this") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters