From 5b871802fd86c7b81fff0a99df3f1699ec8474b7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 26 Jun 2024 19:53:03 +0300 Subject: [PATCH] Add counters for commands processed through the libpq page service API (#8089) I was looking for metrics on how many computes are still using protocol version 1 and 2. This provides counters for that as "pagestream" and "pagestream_v2" commands, but also all the other commands. The new metrics are global for the whole pageserver instance rather than per-tenant, so the additional metrics bloat should be fairly small. --- pageserver/src/metrics.rs | 41 ++++++++++++++++++++++++++++++++++ pageserver/src/page_service.rs | 39 +++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c6b160733167..ca697afcf640 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1445,6 +1445,46 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)] +pub(crate) enum ComputeCommandKind { + PageStreamV2, + PageStream, + Basebackup, + GetLastRecordRlsn, + Fullbackup, + ImportBasebackup, + ImportWal, + LeaseLsn, + Show, +} + +pub(crate) struct ComputeCommandCounters { + map: EnumMap<ComputeCommandKind, IntCounter>, +} + +pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| { + let inner = register_int_counter_vec!( + "pageserver_compute_commands", + "Number of compute -> pageserver commands processed", + &["command"] + ) + .expect("failed to define a metric"); + + ComputeCommandCounters { + map: EnumMap::from_array(std::array::from_fn(|i| { + let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i); + let command_str: &'static str = command.into(); + inner.with_label_values(&[command_str]) + })), + } +}); + +impl ComputeCommandCounters { + pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter { + &self.map[command] + } +} + // remote storage 
metrics static REMOTE_TIMELINE_CLIENT_CALLS: Lazy = Lazy::new(|| { @@ -2949,4 +2989,5 @@ pub fn preinitialize_metrics() { Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&tenant_throttling::TIMELINE_GET); Lazy::force(&BASEBACKUP_QUERY_TIME); + Lazy::force(&COMPUTE_COMMANDS_COUNTERS); } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index ebc23e89458e..6ea5f396d0a7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::LIVE_CONNECTIONS_COUNT; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; @@ -1554,6 +1554,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStreamV2) + .inc(); + self.handle_pagerequests( pgb, tenant_id, @@ -1579,6 +1583,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStream) + .inc(); + self.handle_pagerequests( pgb, tenant_id, @@ -1605,6 +1613,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Basebackup) + .inc(); + let lsn = if let Some(lsn_str) = params.get(2) { Some( Lsn::from_str(lsn_str) @@ -1662,6 +1674,11 @@ where .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; + + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::GetLastRecordRlsn) + .inc(); + async { let timeline = self .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) @@ -1723,6 +1740,10 @@ where 
self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Fullbackup) + .inc(); + // Check that the timeline exists self.handle_basebackup_request( pgb, @@ -1771,6 +1792,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::ImportBasebackup) + .inc(); + match self .handle_import_basebackup( pgb, @@ -1818,6 +1843,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::ImportWal) + .inc(); + match self .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await @@ -1855,6 +1884,10 @@ where self.check_permission(Some(tenant_shard_id.tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::LeaseLsn) + .inc(); + // The caller is responsible for providing correct lsn. let lsn = Lsn::from_str(params[2]) .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; @@ -1886,6 +1919,10 @@ where self.check_permission(Some(tenant_id))?; + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Show) + .inc(); + let tenant = self .get_active_tenant_with_timeout( tenant_id,