forked from Lightning-AI/pytorch-lightning
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
GPU Usage Logger (Lightning-AI#2932)
* GPU utilisation Callback * GPU utilisation Callback * Fixing style * Fixing style * Fixing CodeFactor: partial executable path * Fix a misspelling in the Class name
- Loading branch information
1 parent
4b4c7cc
commit a87c339
Showing
2 changed files
with
143 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
""" | ||
GPU Usage Logger | ||
==================== | ||
Log GPU memory and GPU usage during training | ||
""" | ||
|
||
from pytorch_lightning.callbacks.base import Callback | ||
import subprocess | ||
import os | ||
import time | ||
|
||
|
||
class GpuUsageLogger(Callback): | ||
r""" | ||
Automatically logs GPU memory and GPU usage during training stage. | ||
Args: | ||
memory_utilisation: Set to ``True`` to log used, free and percentage of memory | ||
utilisation at starts and ends of each step. Default: ``True``. | ||
From nvidia-smi --help-query-gpu | ||
memory.used = ```Total memory allocated by active contexts.``` | ||
memory.free = ```Total free memory.``` | ||
gpu_utilisation: Set to ``True`` to log percentage of GPU utilisation. | ||
at starts and ends of each step. Default: ``True``. | ||
intra_step_time: Set to ``True`` to log the time of each step. Default: ``False`` | ||
inter_step_time: Set to ``True`` to log the time between the end of one step | ||
and the start of the next. Default: ``False`` | ||
fan_speed: Set to ``False`` to log percentage of fan speed. Default: ``False``. | ||
temperature: Set to ``True`` to log the memory and gpu temperature in degrees C. | ||
Default: ``False`` | ||
Example:: | ||
>>> from pytorch_lightning import Trainer | ||
>>> from pytorch_lightning.callbacks import GpuUsageLogger | ||
>>> gpu_usage = GpuUsageLogger() | ||
>>> trainer = Trainer(callbacks=[gpu_usage]) | ||
Gpu usage is mainly based on nvidia-smi --query-gpu command. | ||
The description of the queries used here as appears in | ||
in ``nvidia-smi --help-query-gpu``: | ||
"fan.speed" | ||
```The fan speed value is the percent of maximum speed that the device's fan is currently | ||
intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended | ||
fan speed. If the fan is physically blocked and unable to spin, this output will not match | ||
the actual fan speed. Many parts do not report fan speeds because they rely on cooling via | ||
fans in the surrounding enclosure.``` | ||
"memory.used" | ||
```Total memory allocated by active contexts.``` | ||
"memory.free" | ||
```Total free memory.``` | ||
"utilization.gpu" | ||
```Percent of time over the past sample period during which one or more kernels was executing | ||
on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.``` | ||
"utilization.memory" | ||
```Percent of time over the past sample period during which global (device) memory was being | ||
read or written. The sample period may be between 1 second and 1/6 second depending on the | ||
product.``` | ||
"temperature.gpu" | ||
```Core GPU temperature. in degrees C.``` | ||
"temperature.memory" | ||
```HBM memory temperature. in degrees C.``` | ||
""" | ||
|
||
def __init__(self, memory_utilisation: bool = True, gpu_utilisation: bool = True, | ||
intra_step_time: bool = False, inter_step_time: bool = False, | ||
fan_speed: bool = False, temperature: bool = False): | ||
super(GpuUsageLogger).__init__() | ||
self.memory_utilisation = memory_utilisation | ||
self.gpu_utilisation = gpu_utilisation | ||
self.intra_step_time = intra_step_time | ||
self.inter_step_time = inter_step_time | ||
self.fan_speed = fan_speed | ||
self.temperature = temperature | ||
self.snap_intra_step_time = None | ||
self.snap_inter_step_time = None | ||
|
||
def on_batch_start(self, trainer, pl_module): | ||
if self.gpu_utilisation: | ||
self._log_gpu(trainer) | ||
if self.memory_utilisation: | ||
self._log_memory(trainer) | ||
|
||
if self.inter_step_time: | ||
# First log at beginning of second step | ||
if self.snap_inter_step_time: | ||
trainer.logger.log_metrics({'Batch_Time/inter_step (ms)': | ||
(time.time() - self.snap_inter_step_time) * 1000}, | ||
step=trainer.global_step) | ||
if self.intra_step_time: | ||
self.snap_intra_step_time = time.time() | ||
|
||
def on_batch_end(self, trainer, pl_module): | ||
if self.gpu_utilisation: | ||
self._log_gpu(trainer) | ||
if self.memory_utilisation: | ||
self._log_memory(trainer) | ||
|
||
if self.fan_speed: | ||
trainer.logger.log_metrics(self._get_gpu_stat("fan.speed", "%"), step=trainer.global_step) | ||
if self.temperature: | ||
trainer.logger.log_metrics(self._get_gpu_stat("temperature.gpu", "degrees C"), step=trainer.global_step) | ||
trainer.logger.log_metrics(self._get_gpu_stat("temperature.memory", "degrees C"), step=trainer.global_step) | ||
|
||
if self.inter_step_time: | ||
self.snap_inter_step_time = time.time() | ||
|
||
if self.intra_step_time: | ||
if self.snap_intra_step_time: | ||
trainer.logger.log_metrics({'Batch_Time/intra_step (ms)': | ||
(time.time() - self.snap_intra_step_time) * 1000}, | ||
step=trainer.global_step) | ||
|
||
def on_train_epoch_start(self, trainer, pl_module): | ||
self.snap_intra_step_time = None | ||
self.snap_inter_step_time = None | ||
|
||
@staticmethod | ||
def _get_gpu_stat(pitem: str, unit: str): | ||
result = subprocess.run(["/bin/nvidia-smi", f"--query-gpu={pitem}", "--format=csv,nounits,noheader"], | ||
encoding="utf-8", stdout=subprocess.PIPE, | ||
stderr=subprocess.PIPE, # for backward compatibility with python version 3.6 | ||
check=True) | ||
try: | ||
gpu_usage = [float(x) for x in result.stdout.strip().split(os.linesep)] | ||
except ValueError: | ||
gpu_usage = [0] | ||
|
||
return {f"GPU_{pitem}/gpu_id_{index} ({unit})": usage for index, usage in enumerate(gpu_usage)} | ||
|
||
def _log_gpu(self, trainer): | ||
trainer.logger.log_metrics(self._get_gpu_stat("utilization.gpu", "%"), step=trainer.global_step) | ||
|
||
def _log_memory(self, trainer): | ||
trainer.logger.log_metrics(self._get_gpu_stat("memory.used", "MB"), step=trainer.global_step) | ||
trainer.logger.log_metrics(self._get_gpu_stat("memory.free", "MB"), step=trainer.global_step) | ||
trainer.logger.log_metrics(self._get_gpu_stat("utilization.memory", "%"), step=trainer.global_step) |