Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve metrics descriptions and naming #2409

Merged
merged 11 commits into from
Jul 21, 2022
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- Improved naming and description of some telemetry metrics and added
histogram buckets for `tx_latency` metrics.
In addition the default value of the configuration `tx_confirmation`
in Hermes `config.toml` has been changed from `true` to `false`.
([#2408](https://github.com/informalsystems/ibc-rs/issues/2408))
7 changes: 4 additions & 3 deletions config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ clear_on_start = true
# (to any chain in this config file) have been successfully delivered.
# If they have not been, and `clear_interval = 0`, then those packets are
# queued up for re-submission.
# Experimental feature. Affects telemetry if set to false.
# [Default: true]
tx_confirmation = true
# Experimental feature. If set to `false`, the following telemetry metrics will be disabled:
# `ibc_acknowledgment_packets`, `ibc_receive_packets` and `ibc_timeout_packets`.
# [Default: false]
tx_confirmation = false

# The REST section defines parameters for Hermes' built-in RESTful API.
# https://hermes.informal.systems/rest.html
Expand Down
210 changes: 105 additions & 105 deletions guide/src/telemetry.md

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions relayer/src/chain/tracking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ pub enum TrackingId {
/// corresponding event batch, eg. when performing actions from
/// the CLI or during packet clearing.
Static(&'static str),
/// Random identifier used to track latency of packet clearing.
ClearedUuid(Uuid),
}

impl TrackingId {
Expand All @@ -26,6 +28,10 @@ impl TrackingId {
pub fn new_static(s: &'static str) -> Self {
Self::Static(s)
}

pub fn new_cleared_uuid() -> Self {
Self::ClearedUuid(Uuid::new_v4())
}
}

impl fmt::Display for TrackingId {
Expand All @@ -37,6 +43,13 @@ impl fmt::Display for TrackingId {
s.fmt(f)
}
TrackingId::Static(s) => s.fmt(f),
TrackingId::ClearedUuid(u) => {
let mut uuid = "cleared/".to_owned();
let mut s = u.to_string();
s.truncate(8);
uuid.push_str(&s);
uuid.fmt(f)
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions relayer/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pub mod default {
}

pub fn tx_confirmation() -> bool {
true
false
ancazamfir marked this conversation as resolved.
Show resolved Hide resolved
}

pub fn clear_packets_interval() -> u64 {
Expand Down Expand Up @@ -155,7 +155,7 @@ impl Default for ModeConfig {
enabled: true,
clear_interval: default::clear_packets_interval(),
clear_on_start: true,
tx_confirmation: true,
tx_confirmation: false,
},
}
}
Expand Down
55 changes: 30 additions & 25 deletions relayer/src/link/relay_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,8 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
}

fn relay_pending_packets(&self, height: Option<Height>) -> Result<(), LinkError> {
let tracking_id = TrackingId::new_static("relay pending packets");
let tracking_id = TrackingId::new_cleared_uuid();
telemetry!(received_event_batch, tracking_id);

for i in 1..=MAX_RETRIES {
let cleared = self
Expand Down Expand Up @@ -1090,7 +1091,7 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
// Update telemetry info
telemetry!({
for e in events_chunk.clone() {
self.record_cleared_send_packet_and_acknowledgment(e);
self.record_cleared_send_packet(e);
}
});
self.events_to_operational_data(TrackedEvents::new(events_chunk, tracking_id))?;
Expand Down Expand Up @@ -1137,6 +1138,7 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
&self.path_id,
query_write_ack_events,
) {
telemetry!(self.record_cleared_acknowledgments(events_chunk.clone()));
self.events_to_operational_data(TrackedEvents::new(events_chunk, tracking_id))?;
}

Expand Down Expand Up @@ -1762,28 +1764,32 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
}

#[cfg(feature = "telemetry")]
fn record_cleared_send_packet_and_acknowledgment(&self, event: IbcEvent) {
match event {
IbcEvent::SendPacket(send_packet_ev) => {
ibc_telemetry::global().send_packet_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
ibc_telemetry::global().cleared_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
}
IbcEvent::WriteAcknowledgement(write_ack_ev) => {
ibc_telemetry::global().acknowledgement_count(
fn record_cleared_send_packet(&self, event: IbcEvent) {
if let IbcEvent::SendPacket(send_packet_ev) = event {
ibc_telemetry::global().send_packet_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
ibc_telemetry::global().clear_send_packet_count(
send_packet_ev.packet.sequence.into(),
send_packet_ev.height().revision_height(),
&self.src_chain().id(),
self.src_channel_id(),
self.src_port_id(),
&self.dst_chain().id(),
);
}
}

#[cfg(feature = "telemetry")]
fn record_cleared_acknowledgments(&self, events: Vec<IbcEvent>) {
for e in events {
if let IbcEvent::WriteAcknowledgement(write_ack_ev) = e {
ibc_telemetry::global().clear_acknowledgment_packet_count(
write_ack_ev.packet.sequence.into(),
write_ack_ev.height().revision_height(),
&self.dst_chain().id(),
Expand All @@ -1792,7 +1798,6 @@ impl<ChainA: ChainHandle, ChainB: ChainHandle> RelayPath<ChainA, ChainB> {
&self.src_chain().id(),
);
}
_ => {}
}
}
}
79 changes: 58 additions & 21 deletions telemetry/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,11 @@ pub struct TelemetryState {
/// Counts the number of WriteAcknowledgement Hermes transfers.
acknowledgement_count: Counter<u64>,

/// Counts the number of SendPacket Hermes transfers from ClearPacket.
cleared_count: Counter<u64>,
/// Counts the number of SendPacket events Hermes processes from ClearPendingPackets.
cleared_send_packet_count: Counter<u64>,

/// Counts the number of WriteAcknowledgment events Hermes processes from ClearPendingPackets.
cleared_acknowledgment_count: Counter<u64>,

/// Records the sequence number of the oldest SendPacket for which no
/// WriteAcknowledgement has been received. The value is 0 if all the
Expand Down Expand Up @@ -162,7 +165,9 @@ impl TelemetryState {
KeyValue::new("src_port", src_port.to_string()),
];

self.receive_packets.add(count, labels);
if count > 0 {
self.receive_packets.add(count, labels);
}
}

/// Number of acknowledgment packets relayed, per channel
Expand All @@ -179,7 +184,9 @@ impl TelemetryState {
KeyValue::new("src_port", src_port.to_string()),
];

self.acknowledgment_packets.add(count, labels);
if count > 0 {
self.acknowledgment_packets.add(count, labels);
}
}

/// Number of timeout packets relayed, per channel
Expand All @@ -196,7 +203,9 @@ impl TelemetryState {
KeyValue::new("src_port", src_port.to_string()),
];

self.timeout_packets.add(count, labels);
if count > 0 {
self.timeout_packets.add(count, labels);
}
}

/// Number of queries emitted by the relayer, per chain and query type
Expand Down Expand Up @@ -351,7 +360,7 @@ impl TelemetryState {
self.acknowledgement_count.add(1, labels);
}

pub fn cleared_count(
pub fn clear_send_packet_count(
&self,
_seq_nr: u64,
_height: u64,
Expand All @@ -367,7 +376,26 @@ impl TelemetryState {
KeyValue::new("port", port_id.to_string()),
];

self.cleared_count.add(1, labels);
self.cleared_send_packet_count.add(1, labels);
}

pub fn clear_acknowledgment_packet_count(
&self,
_seq_nr: u64,
_height: u64,
chain_id: &ChainId,
channel_id: &ChannelId,
port_id: &PortId,
counterparty_chain_id: &ChainId,
) {
let labels: &[KeyValue; 4] = &[
KeyValue::new("chain", chain_id.to_string()),
KeyValue::new("counterparty", counterparty_chain_id.to_string()),
KeyValue::new("channel", channel_id.to_string()),
KeyValue::new("port", port_id.to_string()),
];

self.cleared_acknowledgment_count.add(1, labels);
}

pub fn record_send_history(
Expand Down Expand Up @@ -493,9 +521,14 @@ impl AggregatorSelector for CustomAggregatorSelector {
// Prometheus' supports only collector for histogram, sum, and last value aggregators.
// https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418
// TODO: Once quantile sketches are supported, replace histograms with that.
// For the moment, disable histogram buckets since no values make sense for all use-cases.
"tx_latency_submitted" => Some(Arc::new(histogram(descriptor, &[]))),
"tx_latency_confirmed" => Some(Arc::new(histogram(descriptor, &[]))),
"tx_latency_submitted" => Some(Arc::new(histogram(
descriptor,
&[200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0],
))),
"tx_latency_confirmed" => Some(Arc::new(histogram(
descriptor,
&[1000.0, 5000.0, 9000.0, 13000.0, 17000.0, 20000.0],
))),
_ => Some(Arc::new(sum())),
}
}
Expand Down Expand Up @@ -529,17 +562,17 @@ impl Default for TelemetryState {

receive_packets: meter
.u64_counter("ibc_receive_packets")
.with_description("Number of receive packets relayed per channel")
.with_description("Number of confirmed receive packets relayed per channel. Available if relayer runs with Tx confirmation enabled")
.init(),

acknowledgment_packets: meter
.u64_counter("ibc_acknowledgment_packets")
.with_description("Number of acknowledgment packets relayed per channel")
.with_description("Number of confirmed acknowledgment packets relayed per channel. Available if relayer runs with Tx confirmation enabled")
.init(),

timeout_packets: meter
.u64_counter("ibc_timeout_packets")
.with_description("Number of timeout packets relayed per channel")
.with_description("Number of confirmed timeout packets relayed per channel. Available if relayer runs with Tx confirmation enabled")
.init(),

queries: meter
Expand Down Expand Up @@ -571,23 +604,27 @@ impl Default for TelemetryState {

wallet_balance: meter
.f64_value_recorder("wallet_balance")
.with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in
precision might be introduced in the displayed value")
.with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in precision might be introduced in the displayed value")
.init(),

send_packet_count: meter
.u64_counter("send_packet_count")
.with_description("Number of SendPacket relayed")
.with_description("Number of SendPacket events processed")
.init(),

acknowledgement_count: meter
.u64_counter("acknowledgement_count")
.with_description("Number of WriteAcknowledgement relayed")
.with_description("Number of WriteAcknowledgement events processed")
.init(),

cleared_send_packet_count: meter
.u64_counter("cleared_send_packet_count")
.with_description("Number of SendPacket events processed during the initial and periodic clearing")
.init(),

cleared_count: meter
.u64_counter("cleared_count")
.with_description("Number of SendPacket relayed through ClearPendingPackets")
cleared_acknowledgment_count: meter
.u64_counter("cleared_acknowledgment_count")
.with_description("Number of WriteAcknowledgment events processed during the initial and periodic clearing")
.init(),

tx_latency_submitted: meter
Expand Down Expand Up @@ -615,7 +652,7 @@ impl Default for TelemetryState {

oldest_sequence: meter
.u64_value_recorder("oldest_sequence")
.with_description("The sequence number of the oldest pending SendPacket. If this value is 0, it means there are no pending SendPacket")
.with_description("The sequence number of the oldest SendPacket event observed without its corresponding WriteAcknowledgement event. If this value is 0, it means Hermes observed a WriteAcknowledgment event for all the SendPacket events")
.init(),

oldest_timestamp: meter
Expand Down