Skip to content

Commit

Permalink
Optional removal of metrics from Prometheus PushGateway on shutdown (#…
Browse files Browse the repository at this point in the history
…14935)

* Optional removal of metrics from Prometheus PushGateway on shutdown

* Make pushGatewayDeleteOnShutdown property nullable

* Add waitForShutdownDelay property

* Fix unit test

* Address PR comments

* Address PR comments

* Add explanation on why it is useful to have deletePushGatewayMetricsOnShutdown

* Fix spelling error

* Fix spelling error
  • Loading branch information
BartMiki authored Dec 13, 2023
1 parent 8bc7a5f commit 4670a76
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 21 deletions.
7 changes: 4 additions & 3 deletions docs/development/extensions-contrib/prometheus.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ To use this Apache Druid extension, [include](../../configuration/extensions.md#

This extension exposes [Druid metrics](https://druid.apache.org/docs/latest/operations/metrics.html) for collection by a Prometheus server (https://prometheus.io/).

Emitter is enabled by setting `druid.emitter=prometheus` [configs](https://druid.apache.org/docs/latest/configuration/index.html#enabling-metrics) or include `prometheus` in the composing emitter list.

Emitter is enabled by setting `druid.emitter=prometheus` [configs](https://druid.apache.org/docs/latest/configuration/index.html#enabling-metrics) or include `prometheus` in the composing emitter list.

## Configuration

Expand All @@ -47,6 +46,8 @@ All the configuration parameters for the Prometheus emitter are under `druid.emi
| `druid.emitter.prometheus.pushGatewayAddress` | Pushgateway address. Required if using `pushgateway` strategy. | no | none |
| `druid.emitter.prometheus.flushPeriod` | Emit metrics to Pushgateway every `flushPeriod` seconds. Required if `pushgateway` strategy is used. | no | 15 |
| `druid.emitter.prometheus.extraLabels` | JSON key-value pairs for additional labels on all metrics. Keys (label names) must match the regex `[a-zA-Z_:][a-zA-Z0-9_:]*`. Example: `{"cluster_name": "druid_cluster1", "env": "staging"}`. | no | none |
| `druid.emitter.prometheus.deletePushGatewayMetricsOnShutdown` | Flag to delete metrics from Pushgateway on task shutdown. Works only if `pushgateway` strategy is used. This feature allows to delete a stale metrics from batch executed tasks. Otherwise, the Pushgateway will store these stale metrics indefinitely as there is [no time to live mechanism](https://github.com/prometheus/pushgateway/issues/117), using the memory to hold data that was already scraped by Prometheus. | no | false |
| `druid.emitter.prometheus.waitForShutdownDelay` | Time in milliseconds to wait for peon tasks to delete metrics from the Pushgateway on shutdown (e.g. 60_000). Applicable only when `pushgateway` strategy is used and `pushGatewayDeleteOnShutdown` is set to true. Be aware that there's no guarantee that a peon task will delete metrics from the gateway if the configured delay is more than the [Peon's `druid.indexer.task.gracefulShutdownTimeout`](https://druid.apache.org/docs/latest/configuration/#additional-peon-configuration). It is recommended that this value is 1.2 times the configured Prometheus `scrape_interval` of Pushgateway. This ensures, that the metrics should be scraped before the cleanup. | no | none |

### Ports for colocated Druid processes

Expand Down Expand Up @@ -110,5 +111,5 @@ the service name. For example:
"druid/coordinator-segment/count" : { "dimensions" : ["dataSource"], "type" : "gauge" },
"druid/historical-segment/count" : { "dimensions" : ["dataSource", "tier", "priority"], "type" : "gauge" }
```

For most use cases, the default mapping is sufficient.
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,13 @@ public PrometheusEmitter(PrometheusEmitterConfig config)
{
this.config = config;
this.strategy = config.getStrategy();
metrics = new Metrics(config.getNamespace(), config.getDimensionMapPath(), config.isAddHostAsLabel(), config.isAddServiceAsLabel(), config.getExtraLabels());
metrics = new Metrics(
config.getNamespace(),
config.getDimensionMapPath(),
config.isAddHostAsLabel(),
config.isAddServiceAsLabel(),
config.getExtraLabels()
);
}


Expand Down Expand Up @@ -164,7 +170,8 @@ private void emitMetric(ServiceMetricEvent metricEvent)
} else if (metric.getCollector() instanceof Gauge) {
((Gauge) metric.getCollector()).labels(labelValues).set(value.doubleValue());
} else if (metric.getCollector() instanceof Histogram) {
((Histogram) metric.getCollector()).labels(labelValues).observe(value.doubleValue() / metric.getConversionFactor());
((Histogram) metric.getCollector()).labels(labelValues)
.observe(value.doubleValue() / metric.getConversionFactor());
} else {
log.error("Unrecognized metric type [%s]", metric.getCollector().getClass());
}
Expand Down Expand Up @@ -202,11 +209,36 @@ public void close()
{
if (strategy.equals(PrometheusEmitterConfig.Strategy.exporter)) {
if (server != null) {
server.stop();
server.close();
}
} else {
exec.shutdownNow();
flush();

try {
if (config.getWaitForShutdownDelay().getMillis() > 0) {
log.info("Waiting [%s]ms before deleting metrics from the push gateway.", config.getWaitForShutdownDelay().getMillis());
Thread.sleep(config.getWaitForShutdownDelay().getMillis());
}
}
catch (InterruptedException e) {
log.error(e, "Interrupted while waiting for shutdown delay. Deleting metrics from the push gateway now.");
}
finally {
deletePushGatewayMetrics();
}
}
}

private void deletePushGatewayMetrics()
{
if (pushGateway != null && config.isDeletePushGatewayMetricsOnShutdown()) {
try {
pushGateway.delete(config.getNamespace(), ImmutableMap.of(config.getNamespace(), identifier));
}
catch (IOException e) {
log.error(e, "Unable to delete prometheus metrics from push gateway");
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import com.google.common.base.Preconditions;
import org.apache.druid.error.DruidException;
import org.apache.druid.java.util.common.StringUtils;
import org.joda.time.Duration;

import javax.annotation.Nullable;
import java.util.Collections;
Expand Down Expand Up @@ -70,6 +71,12 @@ public class PrometheusEmitterConfig
@JsonProperty
private final Map<String, String> extraLabels;

@JsonProperty
private final boolean deletePushGatewayMetricsOnShutdown;

@JsonProperty
private final Duration waitForShutdownDelay;

@JsonCreator
public PrometheusEmitterConfig(
@JsonProperty("strategy") @Nullable Strategy strategy,
Expand All @@ -80,7 +87,9 @@ public PrometheusEmitterConfig(
@JsonProperty("addHostAsLabel") boolean addHostAsLabel,
@JsonProperty("addServiceAsLabel") boolean addServiceAsLabel,
@JsonProperty("flushPeriod") Integer flushPeriod,
@JsonProperty("extraLabels") @Nullable Map<String, String> extraLabels
@JsonProperty("extraLabels") @Nullable Map<String, String> extraLabels,
@JsonProperty("deletePushGatewayMetricsOnShutdown") @Nullable Boolean deletePushGatewayMetricsOnShutdown,
@JsonProperty("waitForShutdownDelay") @Nullable Long waitForShutdownDelay
)
{
this.strategy = strategy != null ? strategy : Strategy.exporter;
Expand All @@ -103,6 +112,23 @@ public PrometheusEmitterConfig(
this.addHostAsLabel = addHostAsLabel;
this.addServiceAsLabel = addServiceAsLabel;
this.extraLabels = extraLabels != null ? extraLabels : Collections.emptyMap();
this.deletePushGatewayMetricsOnShutdown = deletePushGatewayMetricsOnShutdown != null && deletePushGatewayMetricsOnShutdown;

if (waitForShutdownDelay == null) {
this.waitForShutdownDelay = Duration.ZERO;
} else if (waitForShutdownDelay >= 0) {
this.waitForShutdownDelay = Duration.millis(waitForShutdownDelay);
} else {
throw DruidException.forPersona(DruidException.Persona.OPERATOR)
.ofCategory(DruidException.Category.INVALID_INPUT)
.build(
StringUtils.format(
"Invalid value for waitForShutdownDelay[%s] specified, waitForShutdownDelay must be >= 0.",
waitForShutdownDelay
)
);
}

// Validate label names early to prevent Prometheus exceptions later.
for (String key : this.extraLabels.keySet()) {
if (!PATTERN.matcher(key).matches()) {
Expand Down Expand Up @@ -165,6 +191,16 @@ public Map<String, String> getExtraLabels()
return extraLabels;
}

public boolean isDeletePushGatewayMetricsOnShutdown()
{
return deletePushGatewayMetricsOnShutdown;
}

public Duration getWaitForShutdownDelay()
{
return waitForShutdownDelay;
}

public enum Strategy
{
exporter, pushgateway
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public void testEmitterConfigWithBadExtraLabels()

// Expect an exception thrown by our own PrometheusEmitterConfig due to invalid label key
Exception exception = Assert.assertThrows(DruidException.class, () -> {
new PrometheusEmitterConfig(PrometheusEmitterConfig.Strategy.exporter, null, null, 0, null, false, true, 60, extraLabels);
new PrometheusEmitterConfig(PrometheusEmitterConfig.Strategy.exporter, null, null, 0, null, false, true, 60, extraLabels, false, null);
});

String expectedMessage = "Invalid metric label name [label Name]. Label names must conform to the pattern [[a-zA-Z_:][a-zA-Z0-9_:]*]";
Expand Down
Loading

0 comments on commit 4670a76

Please sign in to comment.