Skip to content

Commit

Permalink
Add counters for commands processed through the libpq page service API (#8089)

I was looking for metrics on how many computes are still using protocol
versions 1 and 2. This provides counters for those as the "pagestream" and
"pagestream_v2" commands, as well as for all the other commands. The new
metrics are global for the whole pageserver instance rather than
per-tenant, so the additional metrics bloat should be fairly small.
  • Loading branch information
hlinnaka committed Jun 26, 2024
1 parent 24ce73f commit 5b87180
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 1 deletion.
41 changes: 41 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1445,6 +1445,46 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

/// The kinds of page service commands that are counted individually in
/// `COMPUTE_COMMANDS_COUNTERS`.
///
/// Each variant is converted to a `&'static str` (via the `IntoStaticStr`
/// derive) and used as the value of the `command` label on the
/// `pageserver_compute_commands` counter.
///
/// NOTE(review): there is no strum rename attribute on this enum, so the label
/// text is presumably the variant identifier exactly as written here (e.g.
/// `PageStreamV2`) rather than the wire-level command name — confirm against
/// the scraped metric before building dashboards on it.
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
pub(crate) enum ComputeCommandKind {
PageStreamV2,
PageStream,
Basebackup,
GetLastRecordRlsn,
Fullbackup,
ImportBasebackup,
ImportWal,
LeaseLsn,
Show,
}

/// A set of counters holding one `IntCounter` per `ComputeCommandKind`.
pub(crate) struct ComputeCommandCounters {
// Indexed directly by command kind; every variant has an entry.
map: EnumMap<ComputeCommandKind, IntCounter>,
}

pub(crate) static COMPUTE_COMMANDS_COUNTERS: Lazy<ComputeCommandCounters> = Lazy::new(|| {
let inner = register_int_counter_vec!(
"pageserver_compute_commands",
"Number of compute -> pageserver commands processed",
&["command"]
)
.expect("failed to define a metric");

ComputeCommandCounters {
map: EnumMap::from_array(std::array::from_fn(|i| {
let command = <ComputeCommandKind as enum_map::Enum>::from_usize(i);
let command_str: &'static str = command.into();
inner.with_label_values(&[command_str])
})),
}
});

impl ComputeCommandCounters {
    /// Returns a reference to the counter that tracks `command`.
    ///
    /// Callers bump it with `.inc()` when they process a command of that kind.
    pub(crate) fn for_command(&self, command: ComputeCommandKind) -> &IntCounter {
        let Self { map } = self;
        &map[command]
    }
}

// remote storage metrics

static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
Expand Down Expand Up @@ -2949,4 +2989,5 @@ pub fn preinitialize_metrics() {
Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&tenant_throttling::TIMELINE_GET);
Lazy::force(&BASEBACKUP_QUERY_TIME);
Lazy::force(&COMPUTE_COMMANDS_COUNTERS);
}
39 changes: 38 additions & 1 deletion pageserver/src/page_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ use crate::basebackup::BasebackupError;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT};
use crate::pgdatadir_mapping::Version;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
Expand Down Expand Up @@ -1554,6 +1554,10 @@ where

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStreamV2)
.inc();

self.handle_pagerequests(
pgb,
tenant_id,
Expand All @@ -1579,6 +1583,10 @@ where

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::PageStream)
.inc();

self.handle_pagerequests(
pgb,
tenant_id,
Expand All @@ -1605,6 +1613,10 @@ where

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Basebackup)
.inc();

let lsn = if let Some(lsn_str) = params.get(2) {
Some(
Lsn::from_str(lsn_str)
Expand Down Expand Up @@ -1662,6 +1674,11 @@ where
.record("timeline_id", field::display(timeline_id));

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::GetLastRecordRlsn)
.inc();

async {
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
Expand Down Expand Up @@ -1723,6 +1740,10 @@ where

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Fullbackup)
.inc();

// Check that the timeline exists
self.handle_basebackup_request(
pgb,
Expand Down Expand Up @@ -1771,6 +1792,10 @@ where

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportBasebackup)
.inc();

match self
.handle_import_basebackup(
pgb,
Expand Down Expand Up @@ -1818,6 +1843,10 @@ where

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::ImportWal)
.inc();

match self
.handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
.await
Expand Down Expand Up @@ -1855,6 +1884,10 @@ where

self.check_permission(Some(tenant_shard_id.tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::LeaseLsn)
.inc();

// The caller is responsible for providing correct lsn.
let lsn = Lsn::from_str(params[2])
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
Expand Down Expand Up @@ -1886,6 +1919,10 @@ where

self.check_permission(Some(tenant_id))?;

COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Show)
.inc();

let tenant = self
.get_active_tenant_with_timeout(
tenant_id,
Expand Down

1 comment on commit 5b87180

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2998 tests run: 2871 passed, 1 failed, 126 skipped (full report)


Failures on Postgres 16

  • test_pg_regress[4]: debug
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_pg_regress[debug-pg16-4]"
Flaky tests (2)

Postgres 14

  • test_subscriber_restart: release
  • test_delete_timeline_client_hangup: debug

Test coverage report is not available

The comment gets automatically updated with the latest test results
5b87180 at 2024-06-26T18:27:38.221Z :recycle:

Please sign in to comment.