Skip to content

Commit

Permalink
Torchserve Metrics support for Intel GPUs enabled (#3141)
Browse files Browse the repository at this point in the history
* Initial Changes

* Add util functions that parse the number of GPUs and the metrics from each GPU.

* Fix reading 3 GPU metrics from each device. To be tested on multi-gpu systems.

* system_metrics.py - TS_IPEX_GPU_ENABLE flag

* Final Commit

* Fix data type in metric reading

* Refactor gpu metric enabling as a custom collector script.

* Refactor metric enabling files

* Fix lint changed file.

* Fix more lint changed file.

---------

Co-authored-by: Kanya-Mo <167922169+Kanya-Mo@users.noreply.github.com>
Co-authored-by: Mo, Kanya <kanya.mo@intel.com>
Co-authored-by: Ankith Gunapal <agunapal@ischool.Berkeley.edu>
  • Loading branch information
4 people authored Jun 25, 2024
1 parent c7bbf2c commit 1fc8bd0
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 0 deletions.
54 changes: 54 additions & 0 deletions examples/intel_extension_for_pytorch/intel_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import csv
import logging
import subprocess
from io import StringIO

# Discovery command; its CSV output has the columns:
# Device ID,Device Name,Vendor Name
cmd_discovery = "xpu-smi discovery --dump 1,2,3"
# Metric dump command; its CSV output has the columns:
# Timestamp, DeviceId, GPU Utilization (%), GPU Memory Utilization (%), GPU Memory Used (MiB); N/A if read failed
# The "X" is a placeholder that list_gpu_info() replaces with a
# comma-separated list of device ids before running the command.
cmd_dump = "xpu-smi dump -d X -m 0,5,18 -n 1"


def check_cmd(cmd):
    """
    Run a command and parse its standard output as CSV.

    :param cmd: command line to execute, given as a single string
    :return: list of rows, each a list of whitespace-stripped fields. A
        trailing row holding at most one field (for example a blank last
        line) is dropped. An empty list is returned when the command times
        out, cannot be found, exits with a non-zero status or prints nothing.
    """
    # Imported here so the function does not rely on the module header.
    import shlex

    try:
        # Run without a shell: a missing executable then raises
        # FileNotFoundError instead of being hidden behind a shell exit code.
        out = subprocess.check_output(shlex.split(cmd), timeout=5, text=True)
    except subprocess.TimeoutExpired:
        logging.error("Timeout running %s", cmd)
        return []
    except FileNotFoundError:
        logging.error("xpu-smi command not found. Cannot collect Intel GPU metrics.")
        return []
    except subprocess.CalledProcessError as e:
        logging.error("Error running %s: %s", cmd, e)
        return []

    rows = [[field.strip() for field in row] for row in csv.reader(StringIO(out))]
    # Guard against empty output before looking at the last row.
    if rows and len(rows[-1]) <= 1:
        rows.pop()
    return rows


def count_gpus():
    """
    Count the devices listed by the discovery command.

    :return: number of rows with more than one field in the output of
        ``cmd_discovery``, minus one for the header row; never negative
    """
    rows = check_cmd(cmd_discovery)
    populated = sum(1 for row in rows if len(row) > 1)
    # One populated row is the CSV header. Clamp so that missing or empty
    # output reports 0 devices instead of -1.
    return max(populated - 1, 0)


def list_gpu_info(num_gpus):
    """
    Dump the metrics of devices ``0 .. num_gpus - 1``.

    :param num_gpus: number of devices to query
    :return: rows parsed by check_cmd from the ``cmd_dump`` output, or an
        empty list when ``num_gpus`` is None or not positive
    :raises Exception: if the dump command produced no rows
    """
    # Nothing to query; also avoids running the command with an empty id list.
    if not num_gpus or num_gpus <= 0:
        return []

    device_ids = ",".join(str(i) for i in range(num_gpus))
    cmd = cmd_dump.replace("X", device_ids)
    rows = check_cmd(cmd)
    if not rows:
        # Report the command that was actually run, not the template.
        raise Exception("Error reading from {}. Please also check input.".format(cmd))
    return rows
92 changes: 92 additions & 0 deletions examples/intel_extension_for_pytorch/intel_gpu_metric_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import argparse
import logging
import sys
import types
from builtins import str

from intel_gpu import list_gpu_info

from ts.metrics.dimension import Dimension
from ts.metrics.metric import Metric
from ts.metrics.process_memory_metric import check_process_mem_usage

# Metric objects appended by gpu_utilization(); collect_all() logs each one.
intel_gpu_system_metrics = []
# Host-level dimension list.
# NOTE(review): not referenced anywhere else in the visible part of this file.
dimension = [Dimension("Level", "Host")]


def gpu_utilization(num_of_gpu):
    """
    Collect gpu metrics.

    Queries list_gpu_info() and appends one Metric per available reading
    (utilization, memory utilization, memory used) of every device to
    ``intel_gpu_system_metrics``. Readings reported as "N/A" are skipped.

    :param num_of_gpu: number of GPUs to query; None or a non-positive value
        collects nothing
    :return: None
    """
    # The command-line value is None when --gpu is not given; treat it as 0.
    if not num_of_gpu or num_of_gpu <= 0:
        return

    # (column index in a dump row, metric name, unit)
    readings = (
        (2, "GPUUtilization", "percent"),
        (3, "GPUMemoryUtilization", "percent"),
        (4, "GPUMemoryUsed", "MB"),
    )

    info = list_gpu_info(num_of_gpu)
    # The first row is the CSV header.
    for line in info[1:]:
        if len(line) < 5:
            # Incomplete row: skip it rather than fail on a missing column.
            continue
        dimension_gpu = [
            Dimension("Level", "Host"),
            Dimension("device_id", int(line[1])),
        ]
        for column, name, unit in readings:
            if line[column] != "N/A":
                intel_gpu_system_metrics.append(
                    Metric(name, float(line[column]), unit, dimension_gpu)
                )


def collect_all(mod, num_of_gpu):
    """
    Run the metric collectors of a module and log the gathered metrics.

    Every plain function found on ``mod`` is called without arguments, except
    ``collect_all`` and ``log_msg`` which are skipped, and ``gpu_utilization``
    which is replaced by this file's own gpu_utilization(num_of_gpu).
    Afterwards each entry of ``mod.system_metrics`` and of
    ``intel_gpu_system_metrics`` is logged, followed by an empty line.

    :param mod: module object providing collector functions and ``system_metrics``
    :param num_of_gpu: GPU count forwarded to gpu_utilization
    :return: None
    """
    excluded = ("collect_all", "log_msg")

    for attr_name in dir(mod):
        candidate = getattr(mod, attr_name)
        if not isinstance(candidate, types.FunctionType):
            continue
        func_name = candidate.__name__
        if func_name in excluded:
            continue
        if func_name != "gpu_utilization":
            candidate()
            continue
        # Use the Intel GPU collector instead of the module's own one.
        gpu_utilization(num_of_gpu)

    for batch in (mod.system_metrics, intel_gpu_system_metrics):
        for metric in batch:
            logging.info(str(metric))

    logging.info("")


if __name__ == "__main__":
    # Imported here so this entry point does not rely on the module header.
    import importlib

    parser = argparse.ArgumentParser()

    # default=0: omitting --gpu means "no GPUs" instead of passing None on.
    parser.add_argument(
        "--gpu", action="store", help="number of GPU", type=int, default=0
    )
    arguments = parser.parse_args()

    # Metrics are emitted as bare messages on stdout.
    logging.basicConfig(stream=sys.stdout, format="%(message)s", level=logging.INFO)

    # import_module returns the module if it is already loaded and loads it
    # otherwise, so this does not depend on another import having pulled it in.
    collect_all(importlib.import_module("ts.metrics.system_metrics"), arguments.gpu)

    check_process_mem_usage(sys.stdin)

0 comments on commit 1fc8bd0

Please sign in to comment.