Skip to content

Commit

Permalink
proxy: do not invalidate cache for permit errors (#7652)
Browse files Browse the repository at this point in the history
## Problem

If a permit cannot be acquired to connect to compute, the cache is
invalidated. This had the observed affect of sending more traffic to
ProxyWakeCompute on cplane.

## Summary of changes

Make sure that permit acquire failures are marked as "should not
invalidate cache".
  • Loading branch information
conradludgate committed May 8, 2024
1 parent 02d4286 commit e3a2631
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 18 deletions.
9 changes: 8 additions & 1 deletion proxy/src/compute.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::{
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError},
context::RequestMonitoring,
error::{ReportableError, UserFacingError},
metrics::{Metrics, NumDbConnectionsGuard},
Expand Down Expand Up @@ -34,6 +34,9 @@ pub enum ConnectionError {

#[error("{COULD_NOT_CONNECT}: {0}")]
WakeComputeError(#[from] WakeComputeError),

#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}

impl UserFacingError for ConnectionError {
Expand All @@ -57,6 +60,9 @@ impl UserFacingError for ConnectionError {
None => err.to_string(),
},
WakeComputeError(err) => err.to_string_client(),
TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
_ => COULD_NOT_CONNECT.to_owned(),
}
}
Expand All @@ -72,6 +78,7 @@ impl ReportableError for ConnectionError {
ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
ConnectionError::WakeComputeError(e) => e.get_error_kind(),
ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}
Expand Down
43 changes: 27 additions & 16 deletions proxy/src/console/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::{
compute,
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
context::RequestMonitoring,
error::ReportableError,
intern::ProjectIdInt,
metrics::ApiLockMetrics,
scram, EndpointCacheKey,
Expand All @@ -30,6 +31,8 @@ pub mod errors {
};
use thiserror::Error;

use super::ApiLockError;

/// A go-to error message which doesn't leak any detail.
const REQUEST_FAILED: &str = "Console request failed";

Expand Down Expand Up @@ -211,8 +214,8 @@ pub mod errors {
#[error("Too many connections attempts")]
TooManyConnections,

#[error("Timeout waiting to acquire wake compute lock")]
TimeoutError,
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}

// This allows more useful interactions than `#[from]`.
Expand All @@ -222,17 +225,6 @@ pub mod errors {
}
}

impl From<tokio::sync::AcquireError> for WakeComputeError {
fn from(_: tokio::sync::AcquireError) -> Self {
WakeComputeError::TimeoutError
}
}
impl From<tokio::time::error::Elapsed> for WakeComputeError {
fn from(_: tokio::time::error::Elapsed) -> Self {
WakeComputeError::TimeoutError
}
}

impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
use WakeComputeError::*;
Expand All @@ -245,7 +237,9 @@ pub mod errors {

TooManyConnections => self.to_string(),

TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
}
}
Expand All @@ -256,7 +250,7 @@ pub mod errors {
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
WakeComputeError::ApiError(e) => e.get_error_kind(),
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}
Expand Down Expand Up @@ -456,6 +450,23 @@ pub struct ApiLocks<K> {
metrics: &'static ApiLockMetrics,
}

#[derive(Debug, thiserror::Error)]
pub enum ApiLockError {
#[error("lock was closed")]
AcquireError(#[from] tokio::sync::AcquireError),
#[error("permit could not be acquired")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}

impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service,
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
}

impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
Expand All @@ -475,7 +486,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
})
}

pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, errors::WakeComputeError> {
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });
}
Expand Down
2 changes: 2 additions & 0 deletions proxy/src/proxy/retry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ impl ShouldRetry for compute::ConnectionError {
match self {
compute::ConnectionError::Postgres(err) => err.should_retry_database_address(),
compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(),
// the cache entry was not checked for validity
compute::ConnectionError::TooManyConnectionAttempts(_) => false,
_ => true,
}
}
Expand Down
2 changes: 1 addition & 1 deletion proxy/src/proxy/wake_compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
WakeupFailureKind::ApiConsoleOtherError
}
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError,
WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
};
Metrics::get()
.proxy
Expand Down
16 changes: 16 additions & 0 deletions proxy/src/serverless/backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use crate::{
console::{
errors::{GetAuthInfoError, WakeComputeError},
locks::ApiLocks,
provider::ApiLockError,
CachedNodeInfo,
},
context::RequestMonitoring,
Expand Down Expand Up @@ -131,6 +132,8 @@ pub enum HttpConnError {
AuthError(#[from] AuthError),
#[error("wake_compute returned error")]
WakeCompute(#[from] WakeComputeError),
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}

impl ReportableError for HttpConnError {
Expand All @@ -141,6 +144,7 @@ impl ReportableError for HttpConnError {
HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
HttpConnError::AuthError(a) => a.get_error_kind(),
HttpConnError::WakeCompute(w) => w.get_error_kind(),
HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(),
}
}
}
Expand All @@ -153,6 +157,9 @@ impl UserFacingError for HttpConnError {
HttpConnError::GetAuthInfo(c) => c.to_string_client(),
HttpConnError::AuthError(c) => c.to_string_client(),
HttpConnError::WakeCompute(c) => c.to_string_client(),
HttpConnError::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
}
}
Expand All @@ -165,6 +172,15 @@ impl ShouldRetry for HttpConnError {
HttpConnError::GetAuthInfo(_) => false,
HttpConnError::AuthError(_) => false,
HttpConnError::WakeCompute(_) => false,
HttpConnError::TooManyConnectionAttempts(_) => false,
}
}
fn should_retry_database_address(&self) -> bool {
match self {
HttpConnError::ConnectionError(e) => e.should_retry_database_address(),
// we never checked cache validity
HttpConnError::TooManyConnectionAttempts(_) => false,
_ => true,
}
}
}
Expand Down

1 comment on commit e3a2631

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3105 tests run: 2958 passed, 1 failed, 146 skipped (full report)


Failures on Postgres 14

  • test_storage_controller_many_tenants[github-actions-selfhosted]: release
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_storage_controller_many_tenants[release-pg14-github-actions-selfhosted]"
Flaky tests (3)

Postgres 16

  • test_vm_bit_clear_on_heap_lock: debug

Postgres 15

  • test_partial_evict_tenant[relative_equal]: release

Postgres 14

Code coverage* (full report)

  • functions: 31.2% (6256 of 20059 functions)
  • lines: 46.6% (46934 of 100613 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
e3a2631 at 2024-05-08T12:10:02.806Z :recycle:

Please sign in to comment.