Skip to content

Commit

Permalink
consumption_metrics: fix periodicness behavior & reporting (#5625)
Browse files Browse the repository at this point in the history
Before this PR, the ticker was running at default miss behavior `Delay`.
For example, here is the startup output with 25k tenants:

```
2023-10-19T09:57:21.682466Z  INFO synthetic_size_worker: starting calculate_synthetic_size_worker
2023-10-19T10:50:44.678202Z  WARN synthetic_size_worker: task iteration took longer than the configured period elapsed=3202.995707156s period=10m task=ConsumptionMetricsSyntheticSizeWorker
2023-10-19T10:52:17.408056Z  WARN synthetic_size_worker: task iteration took longer than the configured period elapsed=2695.72556035s period=10m task=ConsumptionMetricsSyntheticSizeWorker
```

The first message's `elapsed` value is correct. It matches the
delta between the log line timestamps.

The second one is logged ca 1.5min after, though, but reports a much
larger
`elapsed` than 1.5min.

This PR fixes the behavior by copying what `eviction_task.rs` does.
  • Loading branch information
problame committed Oct 23, 2023
1 parent 94b4e76 commit c6ca1d7
Showing 1 changed file with 26 additions and 19 deletions.
45 changes: 26 additions & 19 deletions pageserver/src/consumption_metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use reqwest::Url;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tokio::time::Instant;
use tracing::*;
use utils::id::NodeId;

Expand Down Expand Up @@ -88,22 +89,12 @@ pub async fn collect_metrics(

let node_id = node_id.to_string();

// reminder: ticker is ready immediatedly
let mut ticker = tokio::time::interval(metric_collection_interval);

loop {
let tick_at = tokio::select! {
_ = cancel.cancelled() => return Ok(()),
tick_at = ticker.tick() => tick_at,
};
let started_at = Instant::now();

// these are point in time, with variable "now"
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

if metrics.is_empty() {
continue;
}

let metrics = Arc::new(metrics);

// why not race cancellation here? because we are one of the last tasks, and if we are
Expand Down Expand Up @@ -142,10 +133,19 @@ pub async fn collect_metrics(
let (_, _) = tokio::join!(flush, upload);

crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
started_at.elapsed(),
metric_collection_interval,
BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
);

let res = tokio::time::timeout_at(
started_at + metric_collection_interval,
task_mgr::shutdown_token().cancelled(),
)
.await;
if res.is_ok() {
return Ok(());
}
}
}

Expand Down Expand Up @@ -244,16 +244,14 @@ async fn calculate_synthetic_size_worker(
ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
scopeguard::defer! {
info!("calculate_synthetic_size_worker stopped");
};

// reminder: ticker is ready immediatedly
let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

loop {
let tick_at = tokio::select! {
_ = task_mgr::shutdown_watcher() => return Ok(()),
tick_at = ticker.tick() => tick_at,
};
let started_at = Instant::now();

let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Expand Down Expand Up @@ -281,9 +279,18 @@ async fn calculate_synthetic_size_worker(
}

crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
started_at.elapsed(),
synthetic_size_calculation_interval,
BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
);

let res = tokio::time::timeout_at(
started_at + synthetic_size_calculation_interval,
task_mgr::shutdown_token().cancelled(),
)
.await;
if res.is_ok() {
return Ok(());
}
}
}

1 comment on commit c6ca1d7

@github-actions
Copy link

@github-actions github-actions bot commented on c6ca1d7 Oct 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2409 tests run: 2286 passed, 0 failed, 123 skipped (full report)


Flaky tests (1)

Postgres 16

  • test_crafted_wal_end[last_wal_record_crossing_segment]: release

Code coverage (full report)

  • functions: 53.0% (8471 of 15996 functions)
  • lines: 80.7% (49610 of 61437 lines)

The comment gets automatically updated with the latest test results
c6ca1d7 at 2023-10-23T16:22:05.856Z :recycle:

Please sign in to comment.