From af2042c2ce97f2cdc2f9312dd7ec1e4e7b645426 Mon Sep 17 00:00:00 2001 From: sukun Date: Thu, 17 Oct 2024 08:33:24 +0530 Subject: [PATCH] swarm: add a peer dial latency metric (#2959) --- dashboards/swarm/swarm.json | 122 ++++++++++++++++++++++++++++ p2p/net/swarm/dial_worker.go | 2 +- p2p/net/swarm/swarm_metrics.go | 15 +++- p2p/net/swarm/swarm_metrics_test.go | 2 +- 4 files changed, 137 insertions(+), 4 deletions(-) diff --git a/dashboards/swarm/swarm.json b/dashboards/swarm/swarm.json index d206de29c4..e8536ea8e6 100644 --- a/dashboards/swarm/swarm.json +++ b/dashboards/swarm/swarm.json @@ -2598,6 +2598,128 @@ ], "title": "Dial Success Rates", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 67 + }, + "id": 50, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "histogram_quantile(0.5, sum(rate(libp2p_swarm_dial_latency_seconds_bucket{outcome=\"success\", instance=~\"$instance\"}[$__rate_interval])) by (le))", + "instant": false, + "legendFormat": "50th percentile", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum(rate(libp2p_swarm_dial_latency_seconds_bucket{outcome=\"success\", instance=~\"$instance\"}[$__rate_interval])) by (le))", + "hide": false, + "legendFormat": "90th percentile", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(libp2p_swarm_dial_latency_seconds_bucket{outcome=\"success\", instance=~\"$instance\"}[$__rate_interval])) by (le))", + "hide": false, + "legendFormat": "95th percentile", + "range": true, + "refId": "C" + } + ], + "title": "Peer Dial Latency (Seconds)", + "type": "timeseries" }, { "datasource": { diff --git a/p2p/net/swarm/dial_worker.go b/p2p/net/swarm/dial_worker.go index 360a99e2ab..9d097f3c26 100644 --- a/p2p/net/swarm/dial_worker.go +++ b/p2p/net/swarm/dial_worker.go @@ -162,7 +162,7 @@ loop: case req, ok := <-w.reqch: if !ok { if w.s.metricsTracer != nil { - w.s.metricsTracer.DialCompleted(w.connected, totalDials) + w.s.metricsTracer.DialCompleted(w.connected, totalDials, time.Since(startTime)) } return } diff --git a/p2p/net/swarm/swarm_metrics.go b/p2p/net/swarm/swarm_metrics.go index bedc936d95..6da4993126 100644 --- a/p2p/net/swarm/swarm_metrics.go +++ b/p2p/net/swarm/swarm_metrics.go @@ -77,6 +77,15 @@ var ( }, []string{"outcome", "num_dials"}, ) + dialLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metricNamespace, + Name: "dial_latency_seconds", + Help: "time taken to establish connection with the 
peer", + Buckets: prometheus.ExponentialBuckets(0.001, 1.3, 35), /* 35 upper bounds: 1ms growing 1.3x per step, top finite bucket ~7.5s; fine resolution keeps histogram_quantile estimates meaningful */ + }, + []string{"outcome", "num_dials"}, + ) dialRankingDelay = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: metricNamespace, @@ -118,6 +127,7 @@ var ( connHandshakeLatency, dialsPerPeer, dialRankingDelay, + dialLatency, blackHoleSuccessCounterSuccessFraction, blackHoleSuccessCounterState, blackHoleSuccessCounterNextRequestAllowedAfter, @@ -129,7 +139,7 @@ type MetricsTracer interface { ClosedConnection(network.Direction, time.Duration, network.ConnectionState, ma.Multiaddr) CompletedHandshake(time.Duration, network.ConnectionState, ma.Multiaddr) FailedDialing(ma.Multiaddr, error, error) - DialCompleted(success bool, totalDials int) + DialCompleted(success bool, totalDials int, latency time.Duration) DialRankingDelay(d time.Duration) UpdatedBlackHoleSuccessCounter(name string, state BlackHoleState, nextProbeAfter int, successFraction float64) } @@ -250,7 +260,7 @@ func (m *metricsTracer) FailedDialing(addr ma.Multiaddr, dialErr error, cause er dialError.WithLabelValues(*tags...).Inc() } -func (m *metricsTracer) DialCompleted(success bool, totalDials int) { +func (m *metricsTracer) DialCompleted(success bool, totalDials int, latency time.Duration) { tags := metricshelper.GetStringSlice() defer metricshelper.PutStringSlice(tags) if success { @@ -268,6 +278,7 @@ func (m *metricsTracer) DialCompleted(success bool, totalDials int) { } *tags = append(*tags, numDials) dialsPerPeer.WithLabelValues(*tags...).Inc() + dialLatency.WithLabelValues(*tags...).Observe(latency.Seconds()) } func (m *metricsTracer) DialRankingDelay(d time.Duration) { diff --git a/p2p/net/swarm/swarm_metrics_test.go b/p2p/net/swarm/swarm_metrics_test.go index ef136df28a..151765931b 100644 --- a/p2p/net/swarm/swarm_metrics_test.go +++ b/p2p/net/swarm/swarm_metrics_test.go @@ -92,7 +92,7 @@ func TestMetricsNoAllocNoCover(t *testing.T) { mt.CompletedHandshake(time.Duration(mrand.Intn(100))*time.Second, randItem(connections), randItem(addrs)) 
}, "FailedDialing": func() { mt.FailedDialing(randItem(addrs), randItem(errors), randItem(errors)) }, - "DialCompleted": func() { mt.DialCompleted(mrand.Intn(2) == 1, mrand.Intn(10)) }, + "DialCompleted": func() { mt.DialCompleted(mrand.Intn(2) == 1, mrand.Intn(10), time.Duration(mrand.Intn(1000_000_000))) }, "DialRankingDelay": func() { mt.DialRankingDelay(time.Duration(mrand.Intn(1e10))) }, "UpdatedBlackHoleSuccessCounter": func() { mt.UpdatedBlackHoleSuccessCounter(