From 91645dacc92baee0120602f8850b66ba90f83935 Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Fri, 5 Mar 2021 20:12:03 +0000
Subject: [PATCH] [bug] Fix Pytorch profiler with emit_nvtx (#6260)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* resolve bug

* update changelog

* Update tests/trainer/test_trainer.py

* Update pytorch_lightning/profiler/profilers.py

Co-authored-by: Jirka Borovec

* resolve comments

* resolve flake8

Co-authored-by: Carlos Mocholí
Co-authored-by: Jirka Borovec
---
 CHANGELOG.md                             |   6 +
 pytorch_lightning/profiler/profilers.py  |   1 -
 pytorch_lightning/profiler/pytorch.py    | 303 ++++++++++++++++++++++++
 tests/special_tests.sh                   |   1 +
 tests/trainer/test_trainer.py            |  32 ++-
 5 files changed, 337 insertions(+), 6 deletions(-)
 create mode 100644 pytorch_lightning/profiler/pytorch.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 064408cab8762..24e629494f87f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -25,6 +25,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
+- Fixed PyTorch Profiler with `emit_nvtx` ([#6260](https://github.com/PyTorchLightning/pytorch-lightning/pull/6260))
+
+
+- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272))
+
+
 ## [1.2.2] - 2021-03-02
 
 ### Added
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 2089943aa7593..673db3b87acff 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Profiler to check if there are any bottlenecks in your code."""
-
 import cProfile
 import inspect
 import io
diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py
new file mode 100644
index 0000000000000..88a33a3d367f8
--- /dev/null
+++ b/pytorch_lightning/profiler/pytorch.py
@@ -0,0 +1,303 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Profiler to check if there are any bottlenecks in your code.""" + +import inspect +import logging +import os +from typing import List, Optional + +import torch + +from pytorch_lightning.profiler.profilers import BaseProfiler +from pytorch_lightning.utilities import rank_zero_only +from pytorch_lightning.utilities.cloud_io import get_filesystem +from pytorch_lightning.utilities.distributed import rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +log = logging.getLogger(__name__) + + +class PyTorchProfiler(BaseProfiler): + + PROFILED_FUNCTIONS = ("training_step_and_backward", "validation_step", "test_step") + AVAILABLE_SORT_KEYS = ( + "cpu_time", + "cuda_time", + "cpu_time_total", + "cuda_time_total", + "cpu_memory_usage", + "cuda_memory_usage", + "self_cpu_memory_usage", + "self_cuda_memory_usage", + "count", + ) + + def __init__( + self, + output_filename: Optional[str] = None, + enabled: bool = True, + use_cuda: bool = False, + record_shapes: bool = False, + profile_memory: bool = False, + group_by_input_shapes: bool = False, + with_stack: bool = False, + use_kineto: bool = False, + use_cpu: bool = True, + emit_nvtx: bool = False, + export_to_chrome: bool = False, + path_to_export_trace: str = None, + row_limit: int = 20, + sort_by_key: Optional[str] = None, + profiled_functions: Optional[List] = None, + local_rank: Optional[int] = None, + ): + """ + This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of + different operators inside your model - both on the CPU and GPU + + Args: + + output_filename: optionally save profile results to file instead of printing + to std out when training is finished. When using ``ddp``, + each rank will stream the profiled operation to their own file + with the extension ``_{rank}.txt`` + + enabled: Setting this to False makes this context manager a no-op. + + use_cuda: Enables timing of CUDA events as well using the cudaEvent API. + Adds approximately 4us of overhead to each tensor operation. + + record_shapes: If shapes recording is set, information about input dimensions will be collected. + + profile_memory: Whether to report memory usage, default: True (Introduced in PyTorch 1.6.0) + + group_by_input_shapes: Include operator input shapes and group calls by shape. + + with_stack: record source information (file and line number) for the ops (Introduced in PyTorch 1.7.0) + + use_kineto: experimental support for Kineto profiler (Introduced in PyTorch 1.8.0) + + use_cpu: use_kineto=True and can be used to lower the overhead + for GPU-only profiling (Introduced in PyTorch 1.8.0) + + emit_nvtx: Context manager that makes every autograd operation emit an NVTX range + Run:: + + nvprof --profile-from-start off -o trace_name.prof -- + + To visualize, you can either use:: + + nvvp trace_name.prof + torch.autograd.profiler.load_nvprof(path) + + export_to_chrome: Wether to export the sequence of profiled operators for Chrome. + It will generate a ``.json`` file which can be read by Chrome. + + path_to_export_trace: Directory path to export ``.json`` traces when using ``export_to_chrome=True``. + By default, it will be save where the file being is being run. + + row_limit: Limit the number of rows in a table, `0` is a special value that + removes the limit completely. + + sort_by_key: Keys to sort out profiled table + + profiled_functions: list of profiled functions which will create a context manager on. + Any other will be pass through. 
+
+            local_rank: When running in a distributed setting, local_rank is used for each process
+                to write to its own file if ``output_filename`` is provided.
+
+        Raises:
+            MisconfigurationException:
+                If arg ``sort_by_key`` is not present in ``AVAILABLE_SORT_KEYS``, or
+                if log file is not a ``.txt`` file.
+            ValueError:
+                If you attempt to stop recording an action which was never started.
+        """
+
+        self.profiled_actions = {}
+        self.enabled = enabled
+        self.profiled_functions = profiled_functions or self.PROFILED_FUNCTIONS
+        self.use_cuda = use_cuda
+        self.record_shapes = record_shapes
+        self.profile_memory = profile_memory
+        self.sort_by_key = sort_by_key or ("cuda_time_total" if self.use_cuda else "cpu_time_total")
+        self.with_stack = with_stack
+        self.group_by_input_shapes = group_by_input_shapes and record_shapes
+        self.use_kineto = use_kineto
+        self.use_cpu = use_cpu
+        self.row_limit = row_limit
+        self.emit_nvtx = emit_nvtx
+        self.export_to_chrome = export_to_chrome
+        self.path_to_export_trace = path_to_export_trace
+
+        if export_to_chrome and path_to_export_trace is None:
+            rank_zero_warn(
+                "The exported trace will be saved locally as `path_to_export_trace` is empty."
+                " Note: Each function will generate its own trace file."
+            )
+
+        if self.sort_by_key not in self.AVAILABLE_SORT_KEYS:
+            raise MisconfigurationException(
+                f"Found sort_by_key: {sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. "
+            )
+
+        self.profiled_actions = {}
+        self.context_names = {}
+        self.running_stack = []
+        self.profiler = None
+
+        self.output_fname = output_filename
+        self.output_file = None
+        if local_rank is not None:
+            self.on_train_start(local_rank=local_rank)
+            self.on_train_start = super().on_train_start
+
+    def on_train_start(self, local_rank: Optional[str] = None):
+        self.local_rank = local_rank
+
+        # when logging to `log.info`, only perform profiling on rank 0
+        if local_rank != 0 and self.output_fname is None:
+            self.wrap_functions_into_rank_zero_only()
+
+        if self.output_fname:
+            if local_rank is not None:
+                if '.txt' not in self.output_fname:
+                    raise MisconfigurationException("Log file should be a .txt file.")
+
+                self.output_fname = self.output_fname.replace(".txt", f"_{self.local_rank}.txt")
+
+            fs = get_filesystem(self.output_fname)
+            self.output_file = fs.open(self.output_fname, "w")
+
+        streaming_out = [self.output_file.write] if self.output_file else [log.info]
+        super().__init__(output_streams=streaming_out)
+
+    def wrap_functions_into_rank_zero_only(self):
+        self.start = rank_zero_only(self.start)
+        self.stop = rank_zero_only(self.stop)
+        self.summary = rank_zero_only(self.summary)
+        self.describe = rank_zero_only(self.describe)
+
+    def start(self, action_name: str) -> None:
+        if action_name not in self.profiled_functions:
+            return
+
+        if len(self.running_stack) > 0:
+            self._stop(self.running_stack[-1])
+        self.running_stack.append(action_name)
+
+        self.context_names[action_name] = "/".join(self.running_stack)
+
+        self._start(action_name)
+
+    def _start(self, action_name: str) -> None:
+        if self.emit_nvtx:
+            self._parent_profiler = self._create_profiler(action_name, torch.cuda.profiler.profile, enter=True)
+            self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx)
+        else:
+            self._create_profiler(action_name, torch.autograd.profiler.profile)
+
+    def _create_profiler(self, action_name, profiler, enter=True):
+        init_args = inspect.signature(profiler.__init__).parameters
+        profiler_args = {k: v for k, v in vars(self).items() if k in init_args}
+        pr = profiler(**profiler_args)
+        if enter:
+            out_pr = pr.__enter__()
+            if out_pr is not None:
+                pr = out_pr
+        self.profiler = pr
+        return self.profiler
+
+    def _stop(self, action_name: str) -> None:
+        if self.profiler is None:
+            return
+
+        self.profiler.__exit__(exc_type=None, exc_val=None, exc_tb=None)
+
+        if isinstance(self.profiler, torch.autograd.profiler.emit_nvtx):
+            # when running ``emit_nvtx``, PyTorch requires two context managers.
+            # The parent_profiler needs to be closed as well.
+            self._parent_profiler.__exit__(None, None, None)
+            return
+
+        function_events = self.profiler.function_events
+        self.profiler = None
+        for name in self.running_stack:
+            if name not in self.profiled_actions:
+                self.profiled_actions[name] = function_events
+            else:
+                self.profiled_actions[name] += function_events
+
+    def stop(self, action_name: str) -> None:
+        if action_name not in self.profiled_functions:
+            return
+
+        if len(self.running_stack) == 0 or self.running_stack[-1] != action_name:
+            raise ValueError(  # pragma: no-cover
+                f"Attempting to stop recording an action ({action_name}) which was never started."
+            )
+        self._stop(action_name)
+        self.running_stack.pop()
+        # restore running profiler
+        if len(self.running_stack) > 0:
+            self._start(self.running_stack[-1])
+
+    def summary(self) -> str:
+        recorded_stats = {}
+        output_string = ''
+        local_rank = '0' if self.local_rank is None else self.local_rank
+
+        if not self.enabled:
+            return output_string
+
+        for action_name, function_events in self.profiled_actions.items():
+
+            # next line is a workaround for a PyTorch issue (fixed on master, still present
+            # in 1.7). Without it the code fails with `AssertionError: There is already a CPU
+            # parent event for detach`
+            function_events.populate_cpu_children = lambda: None
+
+            if self.export_to_chrome:
+                filename = f"{action_name}_{local_rank}_trace.json"
+                path_to_trace = filename if self.path_to_export_trace is None \
+                    else os.path.join(self.path_to_export_trace, filename)
+                function_events.export_chrome_trace(path_to_trace)
+
+            if self.emit_nvtx:
+                return output_string
+
+            else:
+                data = function_events.key_averages(group_by_input_shapes=self.group_by_input_shapes)
+                table = data.table(sort_by=self.sort_by_key, row_limit=self.row_limit)
+                recorded_stats[action_name] = table
+
+        # log to standard out
+        output_string = f"{os.linesep}Profiler Report{os.linesep}"
+        for action, stats in recorded_stats.items():
+            output_string += (f"{os.linesep}Profile stats for: {action} rank: {local_rank} {os.linesep}{stats}")
+
+        return output_string
+
+    def describe(self):
+        """Logs a profile report after the conclusion of the training run."""
+        super().describe()
+        if self.output_file:
+            self.output_file.flush()
+
+    def __del__(self):
+        """Close profiler's stream."""
+        if self.output_file:
+            self.output_file.close()
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index 472f7afda5e9e..a2373d05a42ef 100644
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -35,3 +35,4 @@ python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_trainer_
 python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp
 python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler
 python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model
+nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_nested_emit_nvtx
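For context on the workflow exercised by the new `special_tests.sh` entry above: the profiler is attached to the `Trainer` and the whole script is launched under `nvprof`. A minimal usage sketch, not part of the patch - `MyLightningModule` is a hypothetical placeholder, and a CUDA-capable GPU is assumed since `emit_nvtx` relies on `torch.cuda.profiler`::

    # sketch: drive the PyTorchProfiler added above with emit_nvtx enabled
    from pytorch_lightning import Trainer
    from pytorch_lightning.profiler import PyTorchProfiler

    profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
    trainer = Trainer(gpus=1, fast_dev_run=True, profiler=profiler)
    trainer.fit(MyLightningModule())  # MyLightningModule: placeholder LightningModule

    # The script itself must run under nvprof so the emitted NVTX ranges are captured, e.g.:
    #   nvprof --profile-from-start off -o trace_name.prof -- python my_script.py
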
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 23861cdd2563b..26a2e2e1f5204 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -220,8 +220,14 @@ def test_trainer_accumulate_grad_batches_zero_grad(tmpdir, accumulate_grad_batch
 @pytest.mark.parametrize(
     ["accumulate_grad_batches", "limit_train_batches"],
     [
-        ({1: 2, 3: 4}, 1.0),
-        ({1: 2, 3: 4}, 0.5),  # not to be divisible by accumulate_grad_batches on purpose
+        ({
+            1: 2,
+            3: 4
+        }, 1.0),
+        ({
+            1: 2,
+            3: 4
+        }, 0.5),  # not to be divisible by accumulate_grad_batches on purpose
         (3, 1.0),
         (3, 0.8),  # not to be divisible by accumulate_grad_batches on purpose
         (4, 1.0),
@@ -239,9 +245,7 @@ def on_batch_start(self, *_):
     def on_batch_end(self, outputs, batch, batch_idx, *_):
         self.on_train_batch_start_end_dict = self.state_dict()
         for key in self.on_train_batch_start_end_dict.keys():
-            equal = torch.equal(
-                self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key]
-            )
+            equal = torch.equal(self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key])
             if (batch_idx + 1) == self.trainer.num_training_batches:
                 assert equal
             else:
@@ -1587,6 +1591,22 @@ def test_pytorch_profiler_nested(tmpdir):
         assert pa[n] == expected_[n]
 
 
+@RunIf(min_gpus=1, special=True)
+def test_pytorch_profiler_nested_emit_nvtx(tmpdir):
+    """
+    This test checks that emit_nvtx is correctly supported.
+    """
+    profiler = PyTorchProfiler(use_cuda=True, emit_nvtx=True)
+
+    model = BoringModel()
+    trainer = Trainer(
+        fast_dev_run=True,
+        profiler=profiler,
+        gpus=1,
+    )
+    trainer.fit(model)
+
+
 @pytest.mark.parametrize(
     ["limit_train_batches", "global_step", "num_training_batches", "current_epoch", "should_train"],
     [(0.2, 0, 0, 0, False), (0.5, 10, 2, 4, True)],
@@ -1738,6 +1758,7 @@ def test_train_loop_system(tmpdir):
     )
 
     class TestOptimizer(SGD):
+
         def step(self, *args, **kwargs):
             called_methods.append("step")
             return super().step(*args, **kwargs)
@@ -1747,6 +1768,7 @@ def zero_grad(self, *args, **kwargs):
             return super().zero_grad(*args, **kwargs)
 
     class TestModel(BoringModel):
+
         def configure_optimizers(self):
             return TestOptimizer(self.parameters(), lr=0.1)
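As background on why the fix makes `_start` create a parent profiler: PyTorch's documented `emit_nvtx` workflow nests it inside `torch.cuda.profiler.profile()`, so that the CUDA profiler is running while the NVTX ranges are emitted. A bare-PyTorch sketch of that pairing, illustrative only and assuming a CUDA build of PyTorch with a throwaway linear model::

    import torch

    model = torch.nn.Linear(32, 2).cuda()    # any CUDA model will do; placeholder
    x = torch.randn(8, 32, device="cuda")    # placeholder input

    # emit_nvtx() sits inside torch.cuda.profiler.profile(), mirroring what
    # PyTorchProfiler._start/_stop now do when emit_nvtx=True; the script is
    # expected to run under `nvprof --profile-from-start off ...`.
    with torch.cuda.profiler.profile():
        with torch.autograd.profiler.emit_nvtx():
            model(x).sum().backward()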