Skip to content

Commit

Permalink
swarm: add a peer dial latency metric (#2959)
Browse files Browse the repository at this point in the history
  • Loading branch information
sukunrt authored Oct 17, 2024
1 parent 26c9014 commit af2042c
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 4 deletions.
122 changes: 122 additions & 0 deletions dashboards/swarm/swarm.json
Original file line number Diff line number Diff line change
Expand Up @@ -2598,6 +2598,128 @@
],
"title": "Dial Success Rates",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"log": 2,
"type": "log"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 67
},
"id": 50,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
"expr": "histogram_quantile(0.5, sum(rate(libp2p_swarm_dial_latency_seconds_bucket{outcome=\"success\", instance=~\"$instance\"}[$__rate_interval])) by (le))",
"instant": false,
"legendFormat": "50th percentile",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum(rate(libp2p_swarm_dial_latency_seconds_bucket{outcome=\"success\", instance=~\"$instance\"}[$__rate_interval])) by (le))",
"hide": false,
"legendFormat": "90th percentile",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum(rate(libp2p_swarm_dial_latency_seconds_bucket{outcome=\"success\", instance=~\"$instance\"}[$__rate_interval])) by (le))",
"hide": false,
"legendFormat": "95th percentile",
"range": true,
"refId": "C"
}
],
"title": "Peer Dial Latency (Seconds)",
"type": "timeseries"
},
{
"datasource": {
Expand Down
2 changes: 1 addition & 1 deletion p2p/net/swarm/dial_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ loop:
case req, ok := <-w.reqch:
if !ok {
if w.s.metricsTracer != nil {
w.s.metricsTracer.DialCompleted(w.connected, totalDials)
w.s.metricsTracer.DialCompleted(w.connected, totalDials, time.Since(startTime))
}
return
}
Expand Down
15 changes: 13 additions & 2 deletions p2p/net/swarm/swarm_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,15 @@ var (
},
[]string{"outcome", "num_dials"},
)
// dialLatency is a histogram of the time taken to establish a connection
// with a peer, in seconds. Observations are labelled with the dial outcome
// and the number of dials made.
//
// The buckets are 35 exponentially spaced bounds starting at 1ms and growing
// by a factor of 1.3 (the largest finite bound is roughly 7.5s). A fine
// grained layout is required because the dashboard derives the 50th, 90th
// and 95th percentiles from these buckets with histogram_quantile; a handful
// of coarse buckets would make those estimates meaningless.
dialLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: metricNamespace,
		Name:      "dial_latency_seconds",
		Help:      "time taken to establish connection with the peer",
		Buckets:   prometheus.ExponentialBuckets(0.001, 1.3, 35),
	},
	[]string{"outcome", "num_dials"},
)
dialRankingDelay = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: metricNamespace,
Expand Down Expand Up @@ -118,6 +127,7 @@ var (
connHandshakeLatency,
dialsPerPeer,
dialRankingDelay,
dialLatency,
blackHoleSuccessCounterSuccessFraction,
blackHoleSuccessCounterState,
blackHoleSuccessCounterNextRequestAllowedAfter,
Expand All @@ -129,7 +139,7 @@ type MetricsTracer interface {
ClosedConnection(network.Direction, time.Duration, network.ConnectionState, ma.Multiaddr)
CompletedHandshake(time.Duration, network.ConnectionState, ma.Multiaddr)
FailedDialing(ma.Multiaddr, error, error)
DialCompleted(success bool, totalDials int)
DialCompleted(success bool, totalDials int, latency time.Duration)
DialRankingDelay(d time.Duration)
UpdatedBlackHoleSuccessCounter(name string, state BlackHoleState, nextProbeAfter int, successFraction float64)
}
Expand Down Expand Up @@ -250,7 +260,7 @@ func (m *metricsTracer) FailedDialing(addr ma.Multiaddr, dialErr error, cause er
dialError.WithLabelValues(*tags...).Inc()
}

func (m *metricsTracer) DialCompleted(success bool, totalDials int) {
func (m *metricsTracer) DialCompleted(success bool, totalDials int, latency time.Duration) {
tags := metricshelper.GetStringSlice()
defer metricshelper.PutStringSlice(tags)
if success {
Expand All @@ -268,6 +278,7 @@ func (m *metricsTracer) DialCompleted(success bool, totalDials int) {
}
*tags = append(*tags, numDials)
dialsPerPeer.WithLabelValues(*tags...).Inc()
dialLatency.WithLabelValues(*tags...).Observe(latency.Seconds())
}

func (m *metricsTracer) DialRankingDelay(d time.Duration) {
Expand Down
2 changes: 1 addition & 1 deletion p2p/net/swarm/swarm_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func TestMetricsNoAllocNoCover(t *testing.T) {
mt.CompletedHandshake(time.Duration(mrand.Intn(100))*time.Second, randItem(connections), randItem(addrs))
},
"FailedDialing": func() { mt.FailedDialing(randItem(addrs), randItem(errors), randItem(errors)) },
"DialCompleted": func() { mt.DialCompleted(mrand.Intn(2) == 1, mrand.Intn(10)) },
"DialCompleted": func() { mt.DialCompleted(mrand.Intn(2) == 1, mrand.Intn(10), time.Duration(mrand.Intn(1000_000_000))) },
"DialRankingDelay": func() { mt.DialRankingDelay(time.Duration(mrand.Intn(1e10))) },
"UpdatedBlackHoleSuccessCounter": func() {
mt.UpdatedBlackHoleSuccessCounter(
Expand Down

0 comments on commit af2042c

Please sign in to comment.