From e3a2631df9850d9eb3682b3a1765f93644425678 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 8 May 2024 11:33:41 +0100 Subject: [PATCH] proxy: do not invalidate cache for permit errors (#7652) ## Problem If a permit cannot be acquired to connect to compute, the cache is invalidated. This had the observed affect of sending more traffic to ProxyWakeCompute on cplane. ## Summary of changes Make sure that permit acquire failures are marked as "should not invalidate cache". --- proxy/src/compute.rs | 9 ++++++- proxy/src/console/provider.rs | 43 +++++++++++++++++++++------------ proxy/src/proxy/retry.rs | 2 ++ proxy/src/proxy/wake_compute.rs | 2 +- proxy/src/serverless/backend.rs | 16 ++++++++++++ 5 files changed, 54 insertions(+), 18 deletions(-) diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 23266ac4effe..4433b3c1c2dd 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,7 +1,7 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, - console::{errors::WakeComputeError, messages::MetricsAuxInfo}, + console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, metrics::{Metrics, NumDbConnectionsGuard}, @@ -34,6 +34,9 @@ pub enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl UserFacingError for ConnectionError { @@ -57,6 +60,9 @@ impl UserFacingError for ConnectionError { None => err.to_string(), }, WakeComputeError(err) => err.to_string_client(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } _ => COULD_NOT_CONNECT.to_owned(), } } @@ -72,6 +78,7 @@ impl ReportableError for ConnectionError { ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), + ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index a05cf248f6d7..3b996cdbd11f 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -12,6 +12,7 @@ use crate::{ compute, config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, + error::ReportableError, intern::ProjectIdInt, metrics::ApiLockMetrics, scram, EndpointCacheKey, @@ -30,6 +31,8 @@ pub mod errors { }; use thiserror::Error; + use super::ApiLockError; + /// A go-to error message which doesn't leak any detail. const REQUEST_FAILED: &str = "Console request failed"; @@ -211,8 +214,8 @@ pub mod errors { #[error("Too many connections attempts")] TooManyConnections, - #[error("Timeout waiting to acquire wake compute lock")] - TimeoutError, + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } // This allows more useful interactions than `#[from]`. @@ -222,17 +225,6 @@ pub mod errors { } } - impl From for WakeComputeError { - fn from(_: tokio::sync::AcquireError) -> Self { - WakeComputeError::TimeoutError - } - } - impl From for WakeComputeError { - fn from(_: tokio::time::error::Elapsed) -> Self { - WakeComputeError::TimeoutError - } - } - impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; @@ -245,7 +237,9 @@ pub mod errors { TooManyConnections => self.to_string(), - TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } } } } @@ -256,7 +250,7 @@ pub mod errors { WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, WakeComputeError::ApiError(e) => e.get_error_kind(), WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, - WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit, + WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } @@ -456,6 +450,23 @@ pub struct ApiLocks { metrics: &'static ApiLockMetrics, } +#[derive(Debug, thiserror::Error)] +pub enum ApiLockError { + #[error("lock was closed")] + AcquireError(#[from] tokio::sync::AcquireError), + #[error("permit could not be acquired")] + TimeoutError(#[from] tokio::time::error::Elapsed), +} + +impl ReportableError for ApiLockError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service, + ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, + } + } +} + impl ApiLocks { pub fn new( name: &'static str, @@ -475,7 +486,7 @@ impl ApiLocks { }) } - pub async fn get_permit(&self, key: &K) -> Result { + pub async fn get_permit(&self, key: &K) -> Result { if self.permits == 0 { return Ok(WakeComputePermit { permit: None }); } diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 36a05ba190e9..8dec1f1137a2 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -86,6 +86,8 @@ impl ShouldRetry for compute::ConnectionError { match self { compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + // the cache entry was not checked for validity + compute::ConnectionError::TooManyConnectionAttempts(_) => false, _ => true, } } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 3d9e94dd72dc..94b03e1ccc10 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -119,7 +119,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { WakeupFailureKind::ApiConsoleOtherError } WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, - WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError, + WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, }; Metrics::get() .proxy diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 963913a260e1..ce58f575e2cf 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -10,6 +10,7 @@ use crate::{ console::{ errors::{GetAuthInfoError, WakeComputeError}, locks::ApiLocks, + provider::ApiLockError, CachedNodeInfo, }, context::RequestMonitoring, @@ -131,6 +132,8 @@ pub enum HttpConnError { AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } impl ReportableError for HttpConnError { @@ -141,6 +144,7 @@ impl ReportableError for HttpConnError { HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), HttpConnError::WakeCompute(w) => w.get_error_kind(), + HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(), } } } @@ -153,6 +157,9 @@ impl UserFacingError for HttpConnError { HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), HttpConnError::WakeCompute(c) => c.to_string_client(), + HttpConnError::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } } } } @@ -165,6 +172,15 @@ impl ShouldRetry for HttpConnError { HttpConnError::GetAuthInfo(_) => false, HttpConnError::AuthError(_) => false, HttpConnError::WakeCompute(_) => false, + HttpConnError::TooManyConnectionAttempts(_) => false, + } + } + fn should_retry_database_address(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.should_retry_database_address(), + // we never checked cache validity + HttpConnError::TooManyConnectionAttempts(_) => false, + _ => true, } } }