-
Notifications
You must be signed in to change notification settings - Fork 434
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(per-tenant throttling): exclude throttled time from page_service…
… metrics + regression test (#6953) part of #5899 Problem ------- Before this PR, the time spent waiting on the throttle was charged towards the higher-level page_service metrics, i.e., `pageserver_smgr_query_seconds`. The metrics are the foundation of internal SLIs / SLOs. A throttled tenant would cause the SLI to degrade / SLO alerts to fire. Changes ------- - don't charge time spent in throttle towards the page_service metrics - record time spent in throttle in RequestContext and subtract it from the elapsed time - this works because the page_service path doesn't create child context, so, all the throttle time is recorded in the parent - it's quite brittle and will break if we ever decide to spawn child tasks that need child RequestContexts, which would have separate instances of the `micros_spent_throttled` counter. - however, let's punt that to a more general refactoring of RequestContext - add a test case that ensures that - throttling happens for getpage requests; this aspect of the test passed before this PR - throttling delays aren't charged towards the page_service metrics; this aspect of the test only passes with this PR - drive-by: make the throttle log message `info!`, it's an expected condition Performance ----------- I took the same measurements as in #6706 , no meaningful change in CPU overhead. Future Work ----------- This PR enables us to experiment with the throttle for select tenants without affecting the SLI metrics / triggering SLO alerts. Before declaring this feature done, we need more work to happen, specifically: - decide on whether we want to retain the flexibility of throttling any `Timeline::get` call, filtered by TaskKind - versus: separate throttles for each page_service endpoint, potentially with separate config options - the trouble here is that this decision implies changes to the TenantConfig, so, if we start using the current config style now, then decide to switch to a different config, it'll be a breaking change Nice-to-haves but probably not worth the time right now: - Equivalent tests to ensure the throttle applies to all other page_service handlers.
- Loading branch information
Showing
7 changed files
with
308 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
use std::{ | ||
sync::atomic::{AtomicU32, Ordering}, | ||
time::Duration, | ||
}; | ||
|
||
#[derive(Debug)] | ||
pub struct CounterU32 { | ||
inner: AtomicU32, | ||
} | ||
impl Default for CounterU32 { | ||
fn default() -> Self { | ||
Self { | ||
inner: AtomicU32::new(u32::MAX), | ||
} | ||
} | ||
} | ||
impl CounterU32 { | ||
pub fn open(&self) -> Result<(), &'static str> { | ||
match self | ||
.inner | ||
.compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) | ||
{ | ||
Ok(_) => Ok(()), | ||
Err(_) => Err("open() called on clsoed state"), | ||
} | ||
} | ||
pub fn close(&self) -> Result<u32, &'static str> { | ||
match self.inner.swap(u32::MAX, Ordering::Relaxed) { | ||
u32::MAX => Err("close() called on closed state"), | ||
x => Ok(x), | ||
} | ||
} | ||
|
||
pub fn add(&self, count: u32) -> Result<(), &'static str> { | ||
if count == 0 { | ||
return Ok(()); | ||
} | ||
let mut had_err = None; | ||
self.inner | ||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { | ||
u32::MAX => { | ||
had_err = Some("add() called on closed state"); | ||
None | ||
} | ||
x => { | ||
let (new, overflowed) = x.overflowing_add(count); | ||
if new == u32::MAX || overflowed { | ||
had_err = Some("add() overflowed the counter"); | ||
None | ||
} else { | ||
Some(new) | ||
} | ||
} | ||
}) | ||
.map_err(|_| had_err.expect("we set it whenever the function returns None")) | ||
.map(|_| ()) | ||
} | ||
} | ||
|
||
#[derive(Default, Debug)] | ||
pub struct MicroSecondsCounterU32 { | ||
inner: CounterU32, | ||
} | ||
|
||
impl MicroSecondsCounterU32 { | ||
pub fn open(&self) -> Result<(), &'static str> { | ||
self.inner.open() | ||
} | ||
pub fn add(&self, duration: Duration) -> Result<(), &'static str> { | ||
match duration.as_micros().try_into() { | ||
Ok(x) => self.inner.add(x), | ||
Err(_) => Err("add(): duration conversion error"), | ||
} | ||
} | ||
pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> { | ||
let val = self.inner.close()?; | ||
let val = Duration::from_micros(val as u64); | ||
let subbed = match from.checked_sub(val) { | ||
Some(v) => v, | ||
None => return Err("Duration::checked_sub"), | ||
}; | ||
Ok(subbed) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
|
||
use super::*; | ||
|
||
#[test] | ||
fn test_basic() { | ||
let counter = MicroSecondsCounterU32::default(); | ||
counter.open().unwrap(); | ||
counter.add(Duration::from_micros(23)).unwrap(); | ||
let res = counter | ||
.close_and_checked_sub_from(Duration::from_micros(42)) | ||
.unwrap(); | ||
assert_eq!(res, Duration::from_micros(42 - 23)); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
270d3be
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
2567 tests run: 2431 passed, 1 failed, 135 skipped (full report)
Failures on Postgres 16
test_forward_compatibility
: releaseTest coverage report is not available
270d3be at 2024-03-05T15:07:18.303Z :recycle: