diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index fe14bb5751161..3ab572b03f85a 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -49,8 +49,58 @@
 on_train_end        |  5.449e-06      |  5.449e-06
 
+Autograd Profiling
+------------------
+If you would like to focus your profiling on the PyTorch-specific components, you should use the autograd
+profiler. This leverages the native `torch.autograd.profiler`_ context manager to accurately measure time
+spent on PyTorch ops on both the CPU and the GPU.
+
+.. _`torch.autograd.profiler`: https://pytorch.org/docs/stable/autograd.html#profiler
+
+.. code-block:: python
+
+    profiler = AutogradProfiler()
+    trainer = Trainer(..., profiler=profiler)
+
+The profiler's results will be printed when `fit()` completes. The report can be quite long,
+so you can also specify an `output_filename` to save the report to a file instead of logging
+it to your terminal. The output below shows the profiling for the action `model_forward`.
+
+.. code-block:: python
+
+    Profiler Report
+
+    Profile stats for: model_forward
+    --------------------------  ----------------  ---------------  -------------  ---------------  ---------------  ---------------
+    Name                        Self CPU total %  Self CPU total   CPU total %    CPU total        CPU time avg     Number of Calls
+    --------------------------  ----------------  ---------------  -------------  ---------------  ---------------  ---------------
+    batch_norm                  0.05%             4.000us          65.88%         5.365ms          5.365ms          1
+    _batch_norm_impl_index      0.06%             5.000us          65.83%         5.361ms          5.361ms          1
+    native_batch_norm           65.75%            5.355ms          65.75%         5.355ms          5.355ms          1
+    addmm                       13.03%            1.061ms          13.03%         1.061ms          530.500us        2
+    dropout                     0.10%             8.000us          8.53%          695.000us        695.000us        1
+    mul                         6.47%             527.000us        6.47%          527.000us        175.667us        3
+    tanh                        4.44%             362.000us        4.44%          362.000us        362.000us        1
+    unsigned short              3.76%             306.000us        3.76%          306.000us        153.000us        2
+    log_softmax                 0.04%             3.000us          3.25%          265.000us        265.000us        1
+    _log_softmax                3.22%             262.000us        3.22%          262.000us        262.000us        1
+    bernoulli_                  1.71%             139.000us        1.71%          139.000us        139.000us        1
+    div_                        0.52%             42.000us         0.52%          42.000us         42.000us         1
+    nll_loss                    0.04%             3.000us          0.33%          27.000us         27.000us         1
+    view                        0.32%             26.000us         0.32%          26.000us         26.000us         1
+    nll_loss_forward            0.29%             24.000us         0.29%          24.000us         24.000us         1
+    add                         0.11%             9.000us          0.11%          9.000us          9.000us          1
+    empty                       0.05%             4.000us          0.05%          4.000us          2.000us          2
+    empty_like                  0.01%             1.000us          0.05%          4.000us          4.000us          1
+    detach                      0.04%             3.000us          0.04%          3.000us          1.000us          3
+    --------------------------  ----------------  ---------------  -------------  ---------------  ---------------  ---------------
+    Self CPU time total: 8.144ms
+
+
 Advanced Profiling
---------------------
+------------------
 If you want more information on the functions called during each event, you can use the `AdvancedProfiler`.
 This option uses Python's cProfiler_ to provide a report of time spent on *each* function called within your code.
@@ -91,6 +141,11 @@
  60000    1.651    0.000    6.839    0.000  functional.py:42(to_tensor)
  60000    0.260    0.000    5.734    0.000  transforms.py:167(__call__)
+
+
+Profiling custom events
+-----------------------
+
 You can also reference this profiler in your LightningModule to profile specific actions of
 interest. If you don't want the profiler to always be on, you can optionally pass a
 `PassThroughProfiler`, which lets you skip profiling without making any code changes.
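+For example, a minimal sketch of this pattern (the `MyModel` class and the action name
+`custom_step` below are illustrative, not part of this change):
+
+.. code-block:: python
+
+    from pytorch_lightning import LightningModule
+    from pytorch_lightning.profiler import PassThroughProfiler
+
+    class MyModel(LightningModule):
+        def __init__(self, profiler=None):
+            super().__init__()
+            # default to a no-op profiler so profiling stays optional
+            self.profiler = profiler or PassThroughProfiler()
+
+        def custom_step(self, data):
+            # only the code inside this context manager is timed
+            with self.profiler.profile("custom_step"):
+                data = data.pow(2).sum()
+            return data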
 Each profiler has a
@@ -116,11 +171,18 @@ def custom_processing_step(self, data):
 """
-from pytorch_lightning.profiler.profilers import SimpleProfiler, AdvancedProfiler, PassThroughProfiler, BaseProfiler
+from pytorch_lightning.profiler.profilers import (
+    SimpleProfiler,
+    AdvancedProfiler,
+    AutogradProfiler,
+    PassThroughProfiler,
+    BaseProfiler,
+)
 
 __all__ = [
     'BaseProfiler',
     'SimpleProfiler',
     'AdvancedProfiler',
+    'AutogradProfiler',
     'PassThroughProfiler',
 ]
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index ac410b0231453..061a296060512 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -26,6 +26,7 @@
 import fsspec
 import numpy as np
+import torch
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.utilities.cloud_io import get_filesystem
@@ -219,7 +220,8 @@ class AdvancedProfiler(BaseProfiler):
     """
     This profiler uses Python's cProfiler to record more detailed information about
     time spent in each function call recorded during a given action. The output is quite
-    verbose and you should only use this if you want very detailed reports.
+    verbose and you should only use this if you want very detailed reports. This profiler
+    is most helpful when trying to identify code bottlenecks outside of your neural network.
     """
 
     def __init__(self, output_filename: Optional[str] = None, line_count_restriction: float = 1.0):
@@ -283,3 +285,120 @@ def __del__(self):
         """Close profiler's stream."""
         if self.output_file:
             self.output_file.close()
+
+
+class AutogradProfiler(BaseProfiler):
+    """
+    This profiler uses PyTorch's `torch.autograd.profiler.profile` as a backend. It records
+    PyTorch operations on both the CPU and the GPU, capturing the C++ functions executed
+    under the hood. It is most useful when trying to optimize the forward and backward
+    passes of your neural network.
+
+    Caution! The `AutogradProfiler` doesn't currently work with dataloaders that use
+    workers (`num_workers > 0`). Please disable workers when using this profiler.
+    """
+
+    def __init__(
+        self,
+        output_filename: Optional[str] = None,
+        profile_memory: bool = True,
+        use_cuda: bool = False,
+        row_limit: int = 20,
+        group_by_input_shapes: bool = False
+    ):
+        """
+        Args:
+            output_filename: optionally save profile results to a file instead of printing
+                to stdout when training is finished.
+            profile_memory: Record memory usage.
+            use_cuda: Measure the execution time of CUDA kernels.
+            row_limit: Limit the number of rows in a table; `0` is a special value that
+                removes the limit completely.
+            group_by_input_shapes: Include operator input shapes and group calls by shape.
+ """ + self.profile_memory = profile_memory + self.group_by_input_shapes = group_by_input_shapes + self.use_cuda = use_cuda + self.row_limit = row_limit + self.profiled_actions = {} + self.context_names = {} + self.profiler = None + # stack of currently running profilers + self.running_stack = [] + + self.output_fname = output_filename + self.output_file = open(self.output_fname, 'w') if self.output_fname else None + + streaming_out = [self.output_file.write] if self.output_file else [log.info] + super().__init__(output_streams=streaming_out) + + def _start(self, action_name: str) -> None: + self.profiler = torch.autograd.profiler.profile( + profile_memory=self.profile_memory, + use_cuda=self.use_cuda, + record_shapes=self.group_by_input_shapes + ) + self.profiler.__enter__() + + def start(self, action_name: str) -> None: + # stop the running profiler if any + if len(self.running_stack) > 0: + self._stop(self.running_stack[-1]) + self.running_stack.append(action_name) + + self.context_names[action_name] = "/".join(self.running_stack) + + self._start(action_name) + + def _stop(self, action_name: str) -> None: + self.profiler.__exit__( + exc_type=None, + exc_val=None, + exc_tb=None + ) + events = self.profiler.function_events + self.profiler = None + for name in self.running_stack: + if name not in self.profiled_actions: + self.profiled_actions[name] = events + else: + self.profiled_actions[name] += events + + def stop(self, action_name: str) -> None: + if len(self.running_stack) == 0 or self.running_stack[-1] != action_name: + raise ValueError( # pragma: no-cover + f"Attempting to stop recording an action ({action_name}) which was never started." + ) + self._stop(action_name) + self.running_stack.pop() + # restore running profiler + if len(self.running_stack) > 0: + self._start(self.running_stack[-1]) + + def summary(self) -> str: + output_string = f"{os.linesep}Profiler Report{os.linesep}" + + for name, events in self.profiled_actions.items(): + # next line is a workaround for a pytorch issue (fixed on master, still present + # on 1.7). 
+            # Without it, the code fails with `AssertionError: There is already a CPU
+            # parent event for detach`.
+            events.populate_cpu_children = lambda: None
+
+            report = events.key_averages(group_by_input_shapes=self.group_by_input_shapes).table(
+                sort_by=("cuda_time_total" if self.use_cuda else "cpu_time_total"),
+                row_limit=self.row_limit
+            )
+            output_string += f"{os.linesep}Profile stats for: {self.context_names[name]}{os.linesep}{report}"
+
+        return output_string
+
+    def describe(self):
+        """Logs a profile report after the conclusion of the training run."""
+        super().describe()
+        if self.output_file:
+            self.output_file.flush()
+
+    def __del__(self):
+        """Close profiler's stream."""
+        if self.output_file:
+            self.output_file.close()
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index 3bce379c1115c..6c82c35373825 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -4,8 +4,9 @@
 
 import numpy as np
 import pytest
+import torch
 
-from pytorch_lightning.profiler import AdvancedProfiler, SimpleProfiler
+from pytorch_lightning.profiler import AdvancedProfiler, AutogradProfiler, SimpleProfiler
 
 PROFILER_OVERHEAD_MAX_TOLERANCE = 0.0005
 
@@ -14,6 +15,11 @@ def _get_python_cprofile_total_duration(profile):
     return sum([x.inlinetime for x in profile.getstats()])
 
 
+def _get_pytorch_profiler_total_duration(events):
+    total_time = sum([e.cpu_time + e.cuda_time for e in events])
+    return total_time / 1e6  # convert microseconds to seconds
+
+
 def _sleep_generator(durations):
     """
     the profile_iterable method needs an iterable in which we can ensure that we're
@@ -36,6 +42,15 @@ def advanced_profiler(tmpdir):
     return profiler
 
 
+@pytest.fixture
+def autograd_profiler(tmpdir):
+    profiler = AutogradProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"))
+    return profiler
+
+
+# =====================
+# Simple Profiler
+# =====================
 @pytest.mark.parametrize(["action", "expected"], [
     pytest.param("a", [3, 1]),
     pytest.param("b", [2]),
@@ -105,12 +120,16 @@ def test_simple_profiler_value_errors(simple_profiler):
         simple_profiler.stop(action)
 
 
+# =====================
+# Advanced Profiler
+# =====================
 @pytest.mark.parametrize(["action", "expected"], [
     pytest.param("a", [3, 1]),
     pytest.param("b", [2]),
     pytest.param("c", [1])
 ])
 def test_advanced_profiler_durations(advanced_profiler, action, expected):
+    """Ensure the reported durations are reasonably accurate."""
 
     for duration in expected:
         with advanced_profiler.profile(action):
@@ -149,9 +168,7 @@ def test_advanced_profiler_iterable_durations(advanced_profiler, action, expecte
 
 
 def test_advanced_profiler_overhead(advanced_profiler, n_iter=5):
-    """
-    ensure that the profiler doesn't introduce too much overhead during training
-    """
+    """Ensure that the profiler doesn't introduce too much overhead during training."""
     for _ in range(n_iter):
         with advanced_profiler.profile("no-op"):
             pass
@@ -163,9 +180,7 @@ def test_advanced_profiler_overhead(advanced_profiler, n_iter=5):
 
 
 def test_advanced_profiler_describe(tmpdir, advanced_profiler):
-    """
-    ensure the profiler won't fail when reporting the summary
-    """
+    """Ensure the profiler won't fail when reporting the summary."""
     # record at least one event
     with advanced_profiler.profile("test"):
         pass
@@ -184,3 +199,43 @@ def test_advanced_profiler_value_errors(advanced_profiler):
         advanced_profiler.start(action)
 
     advanced_profiler.stop(action)
+
+
+# =====================
+# Autograd Profiler
+# =====================
+
+def test_autograd_profiler_overhead(autograd_profiler, n_iter=5):
"""Ensure that the profiler doesn't introduce too much overhead during training.""" + for _ in range(n_iter): + with autograd_profiler.profile("no-op"): + a = torch.ones(42) + b = torch.abs(a) + c = a + b + + action_profile = autograd_profiler.profiled_actions["no-op"] + total_duration = _get_pytorch_profiler_total_duration(action_profile) + average_duration = total_duration / n_iter + assert average_duration < PROFILER_OVERHEAD_MAX_TOLERANCE + + +def test_autograd_profiler_describe(tmpdir, autograd_profiler): + """Ensure the profiler won't fail when reporting the summary.""" + with autograd_profiler.profile("test"): + pass + + # log to stdout and print to file + autograd_profiler.describe() + data = Path(autograd_profiler.output_fname).read_text() + assert len(data) > 0 + + +def test_autograd_profiler_value_errors(autograd_profiler): + """Ensure errors are raised where expected.""" + + action = "test" + with pytest.raises(ValueError): + autograd_profiler.stop(action) + + autograd_profiler.start(action) + autograd_profiler.stop(action)