-
Notifications
You must be signed in to change notification settings - Fork 858
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Torchserve Metrics support for Intel GPUs enabled (#3141)
* Initial Changes * Add util functions that parse the number of GPUs and the metrics from each GPU. * Fix reading 3 GPU metrics from each device. To be tested on multi-GPU systems. * system_metrics.py - TS_IPEX_GPU_ENABLE flag * Final Commit * Fix data type in metric reading * Refactor GPU metric enabling as a custom collector script. * Refactor metric enabling files * Fix lint changed file. * Fix more lint changed file. --------- Co-authored-by: Kanya-Mo <167922169+Kanya-Mo@users.noreply.github.com> Co-authored-by: Mo, Kanya <kanya.mo@intel.com> Co-authored-by: Ankith Gunapal <agunapal@ischool.Berkeley.edu>
- Loading branch information
1 parent
c7bbf2c
commit 1fc8bd0
Showing
2 changed files
with
146 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import csv | ||
import logging | ||
import subprocess | ||
from io import StringIO | ||
|
||
# Device discovery command. Dumped columns: Device ID, Device Name, Vendor Name.
cmd_discovery = "xpu-smi discovery --dump 1,2,3"
# Metrics dump command template. "X" is a placeholder that list_gpu_info()
# replaces with a comma-separated list of device ids before running it.
# Dumped columns: Timestamp, DeviceId, GPU Utilization (%),
# GPU Memory Utilization (%), GPU Memory Used (MiB); a field is "N/A" if the
# read failed.
cmd_dump = "xpu-smi dump -d X -m 0,5,18 -n 1"
|
||
|
||
def check_cmd(cmd):
    """Run a shell command and parse its standard output as CSV.

    :param cmd: command line string; it is executed through the shell with a
        5 second timeout.
    :return: list of rows, each row being a list of whitespace-stripped field
        strings. A trailing row with at most one field (for example the blank
        line at the end of the output) is dropped. An empty list is returned
        when the command times out, fails, or prints nothing.
    """
    try:
        out = subprocess.check_output(cmd, shell=True, timeout=5, text=True)
    except subprocess.TimeoutExpired:
        logging.error("Timeout running %s", cmd)
        return []
    except FileNotFoundError:
        logging.error("xpu-smi command not found. Cannot collect Intel GPU metrics.")
        return []
    except subprocess.CalledProcessError as e:
        # Because the command runs through the shell, a missing executable is
        # normally reported here as a non-zero exit status.
        logging.error("Error running %s: %s", cmd, e)
        return []

    rows = list(csv.reader(StringIO(out)))
    # Guard the trailing-row check: with no output there is no last row to
    # inspect, and indexing it would raise IndexError.
    if rows and len(rows[-1]) <= 1:
        rows.pop()

    return [[field.strip() for field in row] for row in rows]
|
||
|
||
def count_gpus():
    """Count the devices reported by the discovery command.

    Every row with more than one field is counted and one is subtracted,
    presumably to discount the CSV header row (TODO confirm against real
    ``xpu-smi discovery`` output).

    :return: number of devices, never negative. 0 is returned when the
        discovery command produced no usable rows.
    """
    rows = check_cmd(cmd_discovery)
    multi_field_rows = sum(1 for row in rows if len(row) > 1)
    # Clamp at zero: with no rows at all the subtraction would yield -1, which
    # callers comparing against 0 would treat as a valid device count.
    return max(multi_field_rows - 1, 0)
|
||
|
||
def list_gpu_info(num_gpus):
    """Dump the utilization metrics of devices ``0 .. num_gpus - 1``.

    :param num_gpus: number of devices to query. ``None``, zero and negative
        values mean "no devices".
    :return: parsed rows of the dump command as returned by ``check_cmd``, or
        an empty list when there are no devices to query.
    :raises RuntimeError: if the dump command yields no rows.
    """
    # Reject None and negative counts as well as 0; otherwise the command
    # would be built with an empty device list.
    if not num_gpus or num_gpus <= 0:
        return []

    device_ids = ",".join(str(i) for i in range(num_gpus))
    cmd = cmd_dump.replace("X", device_ids)
    rows = check_cmd(cmd)
    if not rows:
        # Report the command that was actually run, not the template.
        raise RuntimeError(
            "Error reading from {}. Please also check input.".format(cmd)
        )
    return rows
92 changes: 92 additions & 0 deletions
92
examples/intel_extension_for_pytorch/intel_gpu_metric_collector.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import argparse | ||
import logging | ||
import sys | ||
import types | ||
from builtins import str | ||
|
||
from intel_gpu import list_gpu_info | ||
|
||
from ts.metrics.dimension import Dimension | ||
from ts.metrics.metric import Metric | ||
from ts.metrics.process_memory_metric import check_process_mem_usage | ||
|
||
# Metric objects appended by gpu_utilization() and logged by collect_all().
intel_gpu_system_metrics = []
# Host-level dimension.
# NOTE(review): not referenced by the code visible here - confirm it is needed.
dimension = [Dimension("Level", "Host")]
|
||
|
||
def gpu_utilization(num_of_gpu):
    """
    Collect Intel GPU metrics and append them to ``intel_gpu_system_metrics``.

    For every data row returned by ``list_gpu_info`` up to three metrics are
    recorded: GPUUtilization, GPUMemoryUtilization and GPUMemoryUsed. A metric
    whose value is reported as "N/A" is skipped.

    :param num_of_gpu: number of GPUs to query; ``None``, zero or a negative
        value disables collection.
    :return: None
    """
    # ``--gpu`` is optional on the command line, so None can arrive here;
    # comparing None with 0 would raise TypeError.
    if not num_of_gpu or num_of_gpu <= 0:
        return

    # (metric name, column index in a dump row, unit)
    # NOTE(review): the dump command documents memory used in MiB while the
    # unit emitted here is "MB" - confirm which one consumers expect.
    metric_columns = (
        ("GPUUtilization", 2, "percent"),
        ("GPUMemoryUtilization", 3, "percent"),
        ("GPUMemoryUsed", 4, "MB"),
    )

    info = list_gpu_info(num_of_gpu)
    # The first row is skipped, presumably because it is the CSV header.
    for line in info[1:]:
        if len(line) < 5:
            # A short row cannot be indexed safely; skip it instead of
            # aborting the whole collection with IndexError.
            logging.warning("Skipping malformed xpu-smi row: %s", line)
            continue
        dimension_gpu = [
            Dimension("Level", "Host"),
            Dimension("device_id", int(line[1])),
        ]
        for name, column, unit in metric_columns:
            if line[column] != "N/A":
                intel_gpu_system_metrics.append(
                    Metric(name, float(line[column]), unit, dimension_gpu)
                )
|
||
|
||
def collect_all(mod, num_of_gpu):
    """
    Run every metric function found on ``mod`` and log the collected metrics.

    Functions named ``collect_all`` and ``log_msg`` are never invoked. A
    function named ``gpu_utilization`` is replaced by this file's own
    ``gpu_utilization``, which receives ``num_of_gpu``; every other function
    is called without arguments.

    :param mod: module object exposing the metric functions and a
        ``system_metrics`` iterable.
    :param num_of_gpu: GPU count forwarded to ``gpu_utilization``.
    :return: None
    """
    excluded = ("collect_all", "log_msg")

    for attr_name in dir(mod):
        candidate = getattr(mod, attr_name)
        if not isinstance(candidate, types.FunctionType):
            continue
        func_name = candidate.__name__
        if func_name in excluded:
            continue
        if func_name == "gpu_utilization":
            # Use the Intel implementation instead of the module's own.
            gpu_utilization(num_of_gpu)
            continue
        candidate()

    # Metrics gathered by the module's functions are logged first ...
    for metric in mod.system_metrics:
        logging.info(str(metric))

    # ... followed by the Intel GPU metrics gathered in this file.
    for metric in intel_gpu_system_metrics:
        logging.info(str(metric))

    logging.info("")
|
||
|
||
if __name__ == "__main__":
    # Script entry point: parse the GPU count, send log output to stdout as
    # bare messages, collect the system metrics, then hand stdin over to the
    # process memory usage check.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--gpu", type=int, action="store", help="number of GPU")
    arguments = arg_parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stdout)

    system_metrics_module = sys.modules["ts.metrics.system_metrics"]
    collect_all(system_metrics_module, arguments.gpu)

    check_process_mem_usage(sys.stdin)