Skip to content

Commit

Permalink
feat(pageserver): add metrics for number of valid leases after each r…
Browse files Browse the repository at this point in the history
…efresh (#8147)

Part of #7497, closes #8120.

## Summary of changes

This PR adds a metric to track the number of valid leases after `GCInfo`
gets refreshed each time.

Besides this metric, we should also track disk space and synthetic size
(after #8071 is closed) to make sure leases are used properly.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
  • Loading branch information
yliang412 committed Jun 25, 2024
1 parent 9b2f941 commit 961fc0b
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 0 deletions.
17 changes: 17 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,15 @@ static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_valid_lsn_lease_count",
"The number of valid leases after refreshing gc info.",
&["tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});

pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
Expand Down Expand Up @@ -2055,6 +2064,8 @@ pub(crate) struct TimelineMetrics {
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
/// Number of valid LSN leases.
pub valid_lsn_lease_count_gauge: UIntGauge,
shutdown: std::sync::atomic::AtomicBool,
}

Expand Down Expand Up @@ -2153,6 +2164,10 @@ impl TimelineMetrics {
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);

let valid_lsn_lease_count_gauge = VALID_LSN_LEASE_COUNT
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();

TimelineMetrics {
tenant_id,
shard_id,
Expand All @@ -2175,6 +2190,7 @@ impl TimelineMetrics {
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
valid_lsn_lease_count_gauge,
shutdown: std::sync::atomic::AtomicBool::default(),
}
}
Expand Down Expand Up @@ -2224,6 +2240,7 @@ impl TimelineMetrics {
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);

self.evictions_with_low_residence_duration
.write()
Expand Down
5 changes: 5 additions & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2960,6 +2960,11 @@ impl Tenant {
let now = SystemTime::now();
target.leases.retain(|_, lease| !lease.is_expired(&now));

timeline
.metrics
.valid_lsn_lease_count_gauge
.set(target.leases.len() as u64);

match gc_cutoffs.remove(&timeline.timeline_id) {
Some(cutoffs) => {
target.retain_lsns = branchpoints;
Expand Down
1 change: 1 addition & 0 deletions test_runner/fixtures/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]:
"pageserver_evictions_total",
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
"pageserver_valid_lsn_lease_count",
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold
# "pageserver_broken_tenants_count" -- used only for broken
Expand Down

1 comment on commit 961fc0b

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

3010 tests run: 2883 passed, 1 failed, 126 skipped (full report)


Failures on Postgres 16

  • test_empty_branch_remote_storage_upload: debug
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_empty_branch_remote_storage_upload[debug-pg16]"

Test coverage report is not available

The comment gets automatically updated with the latest test results
961fc0b at 2024-06-25T17:11:29.783Z :recycle:

Please sign in to comment.