Skip to content

Commit

Permalink
Revert "Export additional metrics to Prometheus (ray-project#14061)" (ray-project#14134)
Browse files Browse the repository at this point in the history

This reverts commit 82539f2.
  • Loading branch information
simon-mo authored Feb 16, 2021
1 parent 019d84a commit 33316d4
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 74 deletions.
74 changes: 7 additions & 67 deletions dashboard/modules/reporter/reporter_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,25 +77,7 @@ def __init__(self, dashboard_agent):
"node_cpu": Gauge("node_cpu", "Total CPU usage on a ray node",
"percentage", ["ip"]),
"node_mem": Gauge("node_mem", "Total memory usage on a ray node",
- "bytes", ["ip"]),
- "node_disk_usage": Gauge("node_disk_usage",
- "Total disk usage (bytes) on a ray node",
- "bytes", ["ip"]),
- "node_disk_utilization_percentage": Gauge(
- "node_disk_utilization_percentage",
- "Total disk utilization (percentage) on a ray node",
- "percentage", ["ip"]),
- "node_network_sent": Gauge("node_network_sent",
- "Total network sent", "bytes", ["ip"]),
- "node_network_received": Gauge("node_network_received",
- "Total network received", "bytes",
- ["ip"]),
- "node_network_send_speed": Gauge("node_network_send_speed",
- "Network send speed", "bytes/sec",
- ["ip"]),
- "node_network_receive_speed": Gauge("node_network_receive_speed",
- "Network receive speed",
- "bytes/sec", ["ip"]),
+ "mb", ["ip"]),
"raylet_cpu": Gauge("raylet_cpu",
"CPU usage of the raylet on a node.",
"percentage", ["ip", "pid"]),
Expand Down Expand Up @@ -255,10 +237,8 @@ def _get_all_stats(self):
self._network_stats_hist.append((now, network_stats))
self._network_stats_hist = self._network_stats_hist[-7:]
then, prev_network_stats = self._network_stats_hist[0]
- prev_send, prev_recv = prev_network_stats
- now_send, now_recv = network_stats
- network_speed_stats = ((now_send - prev_send) / (now - then),
- (now_recv - prev_recv) / (now - then))
+ netstats = ((network_stats[0] - prev_network_stats[0]) / (now - then),
+ (network_stats[1] - prev_network_stats[1]) / (now - then))
return {
"now": now,
"hostname": self._hostname,
Expand All @@ -271,8 +251,7 @@ def _get_all_stats(self):
"loadAvg": self._get_load_avg(),
"disk": self._get_disk_usage(),
"gpus": self._get_gpu_usage(),
- "network": network_stats,
- "network_speed": network_speed_stats,
+ "net": netstats,
"cmdline": self._get_raylet_cmdline(),
}

Expand All @@ -285,45 +264,10 @@ def _record_stats(self, stats):

# -- Mem per node --
total, avail, _ = stats["mem"]
- mem_usage = float(total - avail)
+ mem_usage = float(total - avail) / 1e6
mem_record = Record(
gauge=self._gauges["node_mem"], value=mem_usage, tags={"ip": ip})

- # -- Disk per node --
- used, free = 0, 0
- for entry in stats["disk"].values():
- used += entry.used
- free += entry.free
- disk_utilization = float(used / (used + free)) * 100
- disk_usage_record = Record(
- gauge=self._gauges["node_disk_usage"], value=used, tags={"ip": ip})
- disk_utilization_percentage_record = Record(
- gauge=self._gauges["node_disk_utilization_percentage"],
- value=disk_utilization,
- tags={"ip": ip})
-
- # -- Network speed (send/receive) stats per node --
- network_stats = stats["network"]
- network_sent_record = Record(
- gauge=self._gauges["node_network_sent"],
- value=network_stats[0],
- tags={"ip": ip})
- network_received_record = Record(
- gauge=self._gauges["node_network_received"],
- value=network_stats[1],
- tags={"ip": ip})
-
- # -- Network speed (send/receive) per node --
- network_speed_stats = stats["network_speed"]
- network_send_speed_record = Record(
- gauge=self._gauges["node_network_send_speed"],
- value=network_speed_stats[0],
- tags={"ip": ip})
- network_receive_speed_record = Record(
- gauge=self._gauges["node_network_receive_speed"],
- value=network_speed_stats[1],
- tags={"ip": ip})
-
raylet_stats = self._get_raylet_stats()
raylet_pid = str(raylet_stats["pid"])
# -- raylet CPU --
Expand All @@ -346,12 +290,8 @@ def _record_stats(self, stats):
"pid": raylet_pid
})

- self._metrics_agent.record_reporter_stats([
- cpu_record, mem_record, disk_usage_record,
- disk_utilization_percentage_record, network_sent_record,
- network_received_record, network_send_speed_record,
- network_receive_speed_record, raylet_cpu_record, raylet_mem_record
- ])
+ self._metrics_agent.record_reporter_stats(
+ [cpu_record, mem_record, raylet_cpu_record, raylet_mem_record])

async def _perform_iteration(self, aioredis_client):
"""Get any changes to the log files and push updates to Redis."""
Expand Down
8 changes: 1 addition & 7 deletions dashboard/modules/reporter/tests/test_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,7 @@ def test_case_stats_exist():
prom_addresses)
return all([
"ray_node_cpu" in metric_names, "ray_node_mem" in metric_names,
- "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names,
- "ray_node_disk_usage" in metric_names,
- "ray_node_disk_utilization_percentage" in metric_names,
- "ray_node_network_sent" in metric_names,
- "ray_node_network_received" in metric_names,
- "ray_node_network_send_speed" in metric_names,
- "ray_node_network_receive_speed" in metric_names
+ "ray_raylet_cpu" in metric_names, "ray_raylet_mem" in metric_names
])

def test_case_ip_correct():
Expand Down

0 comments on commit 33316d4

Please sign in to comment.