From ad00b975781e71570e94ef47347db875951e90d6 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Mon, 18 Jan 2021 19:45:25 +0000
Subject: [PATCH 01/28] add profiler

---
 pytorch_lightning/profiler/__init__.py        |   9 +-
 pytorch_lightning/profiler/profilers.py       | 104 ++++++++++++++++++
 .../trainer/connectors/profiler_connector.py  |   9 +-
 pytorch_lightning/trainer/evaluation_loop.py  |   6 +-
 pytorch_lightning/trainer/training_loop.py    |   5 +-
 pytorch_lightning/utilities/__init__.py       |   1 +
 pytorch_lightning/utilities/imports.py        |   1 +
 tests/trainer/test_trainer.py                 |  25 ++++-
 8 files changed, 153 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index c9ea6eb8ebaf6..329dff1e64e78 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -116,11 +116,18 @@ def custom_processing_step(self, data):
 
 """
 
-from pytorch_lightning.profiler.profilers import AdvancedProfiler, BaseProfiler, PassThroughProfiler, SimpleProfiler
+from pytorch_lightning.profiler.profilers import (
+    AdvancedProfiler,
+    BaseProfiler,
+    PassThroughProfiler,
+    PytorchProfiler,
+    SimpleProfiler,
+)
 
 __all__ = [
     'BaseProfiler',
     'SimpleProfiler',
     'AdvancedProfiler',
     'PassThroughProfiler',
+    "PytorchProfiler",
 ]
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 377ebfb7a51d5..3b44464bcdafa 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -25,9 +25,11 @@
 from typing import Optional, Union
 
 import numpy as np
+import torch
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.utilities.cloud_io import get_filesystem
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
 class BaseProfiler(ABC):
@@ -282,3 +284,105 @@ def __del__(self):
         """Close profiler's stream."""
         if self.output_file:
             self.output_file.close()
+
+
+class PytorchProfiler(BaseProfiler):
+    """
+    This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
+    different operators inside your model - both on the CPU and GPU
+    """
+
+    PROFILED_FUNCTIONS = ["training_step", "validation_step", "test_step"]
+
+    def __init__(self, output_filename: Optional[str] = None,
+                 enabled=True,
+                 use_cuda=False,
+                 record_shapes=True,
+                 profile_memory=True,
+                 with_stack=True,
+                 sort_by_key: str = "self_cuda_memory_usage"):
+        """
+        Args:
+            output_filename: optionally save profile results to file instead of printing
+                to std out when training is finished.
+            line_count_restriction: this can be used to limit the number of functions
+                reported for each action. either an integer (to select a count of lines),
+                or a decimal fraction between 0.0 and 1.0 inclusive (to select a percentage of lines)
+        """
+        self.profiled_actions = {}
+        self.enabled = enabled
+        self.use_cuda = use_cuda
+        self.record_shapes = record_shapes
+        self.profile_memory = profile_memory
+        self.with_stack = with_stack
+        self.sort_by_key = sort_by_key
+        if self.sort_by_key not in self.available_sort_by_keys:
+            raise MisconfigurationException(
+                f"Found sort_by_key: {sort_by_key}. Should be within {self.available_sort_by_keys}. ")
+
+        self.output_fname = output_filename
+        self.output_file = None
+        if self.output_fname:
+            fs = get_filesystem(self.output_fname)
+            self.output_file = fs.open(self.output_fname, "w")
+
+        streaming_out = [self.output_file.write] if self.output_file else [log.info]
+        super().__init__(output_streams=streaming_out)
+
+    def start(self, action_name: str) -> None:
+        if action_name not in self.profiled_actions and action_name in self.PROFILED_FUNCTIONS:
+            self.profiled_actions[action_name] = torch.autograd.profiler.profile(
+                enabled=self.enabled,
+                use_cuda=self.use_cuda,
+                record_shapes=self.record_shapes,
+                profile_memory=self.profile_memory).__enter__()
+
+    def stop(self, action_name: str) -> None:
+        if action_name in self.PROFILED_FUNCTIONS:
+            pr = self.profiled_actions.get(action_name)
+            if pr is None:
+                raise ValueError(  # pragma: no-cover
+                    f"Attempting to stop recording an action ({action_name}) which was never started."
+                )
+            # todo: Find a better solution
+            try:
+                _ = pr.__exit__(None, None, None)
+            except RuntimeError as e:
+                if "Expected debug info of type 2" in str(e):
+                    pass
+                else:
+                    raise RuntimeError(str(e))
+
+    def summary(self) -> str:
+        recorded_stats = {}
+        for action_name, pr in self.profiled_actions.items():
+            table = self.profiled_actions[action_name].key_averages().table(sort_by=self.sort_by_key)
+            recorded_stats[action_name] = table
+
+        # log to standard out
+        output_string = f"{os.linesep}Profiler Report{os.linesep}"
+        for action, stats in recorded_stats.items():
+            output_string += (
+                f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}"
+            )
+
+        return output_string
+
+    def describe(self):
+        """Logs a profile report after the conclusion of the training run."""
+        super().describe()
+        if self.output_file:
+            self.output_file.flush()
+
+    def __del__(self):
+        """Close profiler's stream."""
+        if self.output_file:
+            self.output_file.close()
+
+    @property
+    def available_sort_by_keys(self):
+        return [
+            "cpu_time", "cuda_time", "cpu_time_total",
+            "cuda_time_total", "cpu_memory_usage", "cuda_memory_usage",
+            "self_cpu_memory_usage", "self_cuda_memory_usage", "count"
+        ]
diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py
index 3ecc168231b38..e2992a82bbcf2 100644
--- a/pytorch_lightning/trainer/connectors/profiler_connector.py
+++ b/pytorch_lightning/trainer/connectors/profiler_connector.py
@@ -14,13 +14,20 @@
 
 from typing import Union
 
-from pytorch_lightning.profiler import BaseProfiler, PassThroughProfiler, SimpleProfiler, AdvancedProfiler
+from pytorch_lightning.profiler import (
+    AdvancedProfiler,
+    BaseProfiler,
+    PassThroughProfiler,
+    PytorchProfiler,
+    SimpleProfiler,
+)
 from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 PROFILERS = {
     "simple": SimpleProfiler,
     "advanced": AdvancedProfiler,
+    "pytorch": PytorchProfiler,
 }
 
 
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index a8fa9f43684ca..1a334149d9292 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -171,10 +171,12 @@ def evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx):
         # run actual test step
         if self.testing:
             model_ref._current_fx_name = "test_step"
-            output = self.trainer.accelerator_backend.test_step(args)
+            with self.trainer.profiler.profile("test_step"):
+                output = self.trainer.accelerator_backend.test_step(args)
         else:
             model_ref._current_fx_name = "validation_step"
-            output = self.trainer.accelerator_backend.validation_step(args)
+            with self.trainer.profiler.profile("validation_step"):
+                output = self.trainer.accelerator_backend.validation_step(args)
 
         # capture any logged information
         self.trainer.logger_connector.cache_logged_metrics()
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 6b49dc63f52b4..1b07634908a7e 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -24,7 +24,7 @@
 from pytorch_lightning.core.step_result import EvalResult, Result
 from pytorch_lightning.trainer.states import TrainerState
 from pytorch_lightning.trainer.supporters import Accumulator, TensorRunningAccum
-from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, parsing, DeviceType
+from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType, parsing
 from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.memory import recursive_detach
@@ -339,7 +339,8 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
             # manually capture logged metrics
             model_ref._current_fx_name = 'training_step'
             model_ref._results = Result()
-            training_step_output = self.trainer.accelerator_backend.training_step(args)
+            with self.trainer.profiler.profile("training_step"):
+                training_step_output = self.trainer.accelerator_backend.training_step(args)
             self.trainer.logger_connector.cache_logged_metrics()
 
             self._check_training_step_output(training_step_output)
diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py
index 0a5ed04eb72a3..4b48e6595be6e 100644
--- a/pytorch_lightning/utilities/__init__.py
+++ b/pytorch_lightning/utilities/__init__.py
@@ -34,6 +34,7 @@
     _module_available,
     _NATIVE_AMP_AVAILABLE,
     _OMEGACONF_AVAILABLE,
+    _PYTORCH_GREATER_EQUAL_1_7_0,
     _RPC_AVAILABLE,
     _TORCHTEXT_AVAILABLE,
     _XLA_AVAILABLE,
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index acdebfbf239e4..775c683b92bef 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -53,3 +53,4 @@ def _module_available(module_path: str) -> bool:
 _GROUP_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.group')
 _FAIRSCALE_PIPE_AVAILABLE = _FAIRSCALE_AVAILABLE and LooseVersion(torch.__version__) >= LooseVersion("1.6.0")
 _BOLTS_AVAILABLE = _module_available('pl_bolts')
+_PYTORCH_GREATER_EQUAL_1_7_0 = LooseVersion(torch.__version__) >= LooseVersion("1.7.0")
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 97785d9e61a86..75170c89947c3 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -33,7 +33,7 @@
 from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, SimpleProfiler
 from pytorch_lightning.trainer.logging import TrainerLoggingMixin
 from pytorch_lightning.trainer.states import TrainerState
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
+from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _PYTORCH_GREATER_EQUAL_1_7_0
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import BoringModel, EvalModelTemplate
@@ -1421,6 +1421,7 @@ def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, l
     ('simple', SimpleProfiler),
     ('Simple', SimpleProfiler),
     ('advanced', AdvancedProfiler),
+    ('pytorch', AdvancedProfiler),
 ])
 def test_trainer_profiler_correct_args(profiler, expected):
     kwargs = {'profiler': profiler} if profiler is not None else {}
@@ -1441,3 +1442,25 @@ def test_trainer_profiler_incorrect_arg_type(profiler):
                        match=r"Only None, bool, str and subclasses of `BaseProfiler`"
                              r" are valid values for `Trainer`'s `profiler` parameter. *"):
         Trainer(profiler=profiler)
+
+
+# @pytest.mark.skipif(not _PYTORCH_GREATER_EQUAL_1_7_0, reason='test needs PyTorch 1.7+')
+def test_pytorch_profiler(tmpdir):
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            return {"loss": loss}
+
+    model = TestModel()
+
+    limit_train_batches = 2
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=limit_train_batches,
+        limit_val_batches=2,
+        max_epochs=1,
+        profiler='pytorch'
+    )
+
+    trainer.fit(model)

From cfae67b4253db7761ade48dec6dd14fd353117c8 Mon Sep 17 00:00:00 2001
From: Ubuntu <thomas@grid.ai>
Date: Mon, 18 Jan 2021 20:18:52 +0000
Subject: [PATCH 02/28] add profiler

---
 pytorch_lightning/profiler/profilers.py       | 50 +++++++++++--------
 .../trainer/connectors/profiler_connector.py  |  6 ++-
 pytorch_lightning/utilities/__init__.py       |  2 +-
 pytorch_lightning/utilities/imports.py        |  2 +-
 .../test_train_loop_logging_1_0.py            |  1 +
 tests/trainer/test_trainer.py                 |  4 +-
 6 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 3b44464bcdafa..0a14aa27f8270 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -30,6 +30,7 @@
 from pytorch_lightning import _logger as log
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities import rank_zero_only
 
 
 class BaseProfiler(ABC):
@@ -294,28 +295,31 @@ class PytorchProfiler(BaseProfiler):
 
     PROFILED_FUNCTIONS = ["training_step", "validation_step", "test_step"]
 
-    def __init__(self, output_filename: Optional[str] = None,
+    def __init__(self, 
+                 output_filename: Optional[str] = None,
                  enabled=True,
-                 use_cuda=False,
+                 use_cuda=True,
                  record_shapes=True,
                  profile_memory=True,
-                 with_stack=True,
+                 group_by_input_shape=True,
                  sort_by_key: str = "self_cuda_memory_usage"):
         """
         Args:
             output_filename: optionally save profile results to file instead of printing
                 to std out when training is finished.
-            line_count_restriction: this can be used to limit the number of functions
-                reported for each action. either an integer (to select a count of lines),
-                or a decimal fraction between 0.0 and 1.0 inclusive (to select a percentage of lines)
+            enabled: Setting this to False makes this context manager a no-op. Default: True
+            use_cuda: Enables timing of CUDA events as well using the cudaEvent API. 
+                Adds approximately 4us of overhead to each tensor operation. Default: True
+            record_shapes:  If shapes recording is set, information about input dimensions will be collected.
+            profile_memory: Whether to report memory usage, default: True
         """
         self.profiled_actions = {}
         self.enabled = enabled
         self.use_cuda = use_cuda
         self.record_shapes = record_shapes
         self.profile_memory = profile_memory
-        self.with_stack = with_stack
         self.sort_by_key = sort_by_key
+        self.group_by_input_shape = group_by_input_shape and record_shapes
         if self.sort_by_key not in self.available_sort_by_keys:
             raise MisconfigurationException(
                 f"Found sort_by_key: {sort_by_key}. Should be within {self.available_sort_by_keys}. ")
@@ -330,6 +334,8 @@ def __init__(self, output_filename: Optional[str] = None,
         super().__init__(output_streams=streaming_out)
 
     def start(self, action_name: str) -> None:
+        # PyTorch profiler doesn't seem to work with multiple processes
+        self.enabled = os.getenv("LOCAL_RANK", None) is None
         if action_name not in self.profiled_actions and action_name in self.PROFILED_FUNCTIONS:
             self.profiled_actions[action_name] = torch.autograd.profiler.profile(
                 enabled=self.enabled,
@@ -338,13 +344,14 @@ def start(self, action_name: str) -> None:
                 profile_memory=self.profile_memory).__enter__()
 
     def stop(self, action_name: str) -> None:
-        if action_name in self.PROFILED_FUNCTIONS:
+        if action_name in self.PROFILED_FUNCTIONS and self.enabled:
             pr = self.profiled_actions.get(action_name)
             if pr is None:
                 raise ValueError(  # pragma: no-cover
                     f"Attempting to stop recording an action ({action_name}) which was never started."
                 )
-            # todo: Find a better solution
+            
+            # todo: Find a better solution to exit context manager
             try:
                 _ = pr.__exit__(None, None, None)
             except RuntimeError as e:
@@ -355,18 +362,21 @@ def stop(self, action_name: str) -> None:
 
     def summary(self) -> str:
         recorded_stats = {}
-        for action_name, pr in self.profiled_actions.items():
-            table = self.profiled_actions[action_name].key_averages().table(sort_by=self.sort_by_key)
-            recorded_stats[action_name] = table
-
-        # log to standard out
-        output_string = f"{os.linesep}Profiler Report{os.linesep}"
-        for action, stats in recorded_stats.items():
-            output_string += (
-                f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}"
-            )
+        if self.enabled:
+            for action_name, pr in self.profiled_actions.items():
+                table = self.profiled_actions[action_name].key_averages(
+                    group_by_input_shape=self.group_by_input_shape).table(sort_by=self.sort_by_key)
+                recorded_stats[action_name] = table
+
+            # log to standard out
+            output_string = f"{os.linesep}Profiler Report{os.linesep}"
+            for action, stats in recorded_stats.items():
+                output_string += (
+                    f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}"
+                )
 
-        return output_string
+            return output_string
+        return ''
 
     def describe(self):
         """Logs a profile report after the conclusion of the training run."""
diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py
index e2992a82bbcf2..0c2dfec93715a 100644
--- a/pytorch_lightning/trainer/connectors/profiler_connector.py
+++ b/pytorch_lightning/trainer/connectors/profiler_connector.py
@@ -21,15 +21,17 @@
     PytorchProfiler,
     SimpleProfiler,
 )
-from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities import rank_zero_warn, _PYTORCH_GREATER_EQUAL_1_6_0
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 PROFILERS = {
     "simple": SimpleProfiler,
     "advanced": AdvancedProfiler,
-    "pytorch": PytorchProfiler,
 }
 
+if _PYTORCH_GREATER_EQUAL_1_6_0:
+    PROFILERS["pytorch"] = PytorchProfiler
+
 
 class ProfilerConnector:
 
diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py
index 4b48e6595be6e..fe21849d2962d 100644
--- a/pytorch_lightning/utilities/__init__.py
+++ b/pytorch_lightning/utilities/__init__.py
@@ -34,7 +34,7 @@
     _module_available,
     _NATIVE_AMP_AVAILABLE,
     _OMEGACONF_AVAILABLE,
-    _PYTORCH_GREATER_EQUAL_1_7_0,
+    _PYTORCH_GREATER_EQUAL_1_6_0,
     _RPC_AVAILABLE,
     _TORCHTEXT_AVAILABLE,
     _XLA_AVAILABLE,
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index 775c683b92bef..09a3d36938bce 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -53,4 +53,4 @@ def _module_available(module_path: str) -> bool:
 _GROUP_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.group')
 _FAIRSCALE_PIPE_AVAILABLE = _FAIRSCALE_AVAILABLE and LooseVersion(torch.__version__) >= LooseVersion("1.6.0")
 _BOLTS_AVAILABLE = _module_available('pl_bolts')
-_PYTORCH_GREATER_EQUAL_1_7_0 = LooseVersion(torch.__version__) >= LooseVersion("1.7.0")
+_PYTORCH_GREATER_EQUAL_1_6_0 = LooseVersion(torch.__version__) >= LooseVersion("1.6.0")
diff --git a/tests/trainer/logging_process/test_train_loop_logging_1_0.py b/tests/trainer/logging_process/test_train_loop_logging_1_0.py
index f418db2bd72a5..514bfb49ec79a 100644
--- a/tests/trainer/logging_process/test_train_loop_logging_1_0.py
+++ b/tests/trainer/logging_process/test_train_loop_logging_1_0.py
@@ -740,6 +740,7 @@ def validation_step(self, batch, batch_idx):
         weights_summary=None,
         accelerator="ddp",
         gpus=2,
+        profiler="pytorch"
     )
     trainer.fit(model)
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 75170c89947c3..7d28afc6b0093 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -33,7 +33,7 @@
 from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, SimpleProfiler
 from pytorch_lightning.trainer.logging import TrainerLoggingMixin
 from pytorch_lightning.trainer.states import TrainerState
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _PYTORCH_GREATER_EQUAL_1_7_0
+from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _PYTORCH_GREATER_EQUAL_1_6_0
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import BoringModel, EvalModelTemplate
@@ -1444,7 +1444,7 @@ def test_trainer_profiler_incorrect_arg_type(profiler):
         Trainer(profiler=profiler)
 
 
-# @pytest.mark.skipif(not _PYTORCH_GREATER_EQUAL_1_7_0, reason='test needs PyTorch 1.7+')
+# @pytest.mark.skipif(not _PYTORCH_GREATER_EQUAL_1_6_0, reason='test needs PyTorch 1.7+')
 def test_pytorch_profiler(tmpdir):
     class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):

From 5931c18d0268995e0c523c8acf3b609f9883524e Mon Sep 17 00:00:00 2001
From: Ubuntu <thomas@grid.ai>
Date: Tue, 19 Jan 2021 10:24:51 +0000
Subject: [PATCH 03/28] update

---
 pytorch_lightning/profiler/__init__.py        |  65 +++++++++++
 pytorch_lightning/profiler/profilers.py       | 108 ++++++++++++++----
 .../trainer/connectors/profiler_connector.py  |   7 +-
 pytorch_lightning/utilities/__init__.py       |   1 -
 pytorch_lightning/utilities/imports.py        |   3 +-
 tests/trainer/test_trainer.py                 |  17 ++-
 6 files changed, 164 insertions(+), 37 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 329dff1e64e78..e3339d65e80a6 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -97,6 +97,71 @@
 method `profile()` which returns a context handler. Simply pass in the name of your action that you want
 to track and the profiler will record performance for code executed within this context.
 
+.. code-block:: python
+
+    from pytorch_lightning.profiler import Profiler, PassThroughProfiler
+
+    class MyModel(LightningModule):
+        def __init__(self, profiler=None):
+            self.profiler = profiler or PassThroughProfiler()
+
+        def custom_processing_step(self, data):
+            with profiler.profile('my_custom_action'):
+                # custom processing step
+            return data
+
+    profiler = Profiler()
+    model = MyModel(profiler)
+    trainer = Trainer(profiler=profiler, max_epochs=1)
+
+
+PyTorch Profiling
+--------------------
+
+Autograd includes a profiler that lets you inspect the cost of different operators inside your model - both on the CPU and GPU. 
+
+.. _cProfiler: https://docs.python.org/3/library/profile.html#module-cProfile
+
+.. code-block:: python
+
+    trainer = Trainer(..., profiler="pytorch")
+
+    or
+
+    profiler = PytorchProfiler()
+    trainer = Trainer(..., profiler=profiler)
+
+The profiler's results will be printed at the completion of a training `fit()`. This profiler
+report can be quite long, so you can also specify an `output_filename` to save the report instead
+of logging it to the output in your terminal. The output below shows the profiling for the action
+`get_train_batch`.
+
+.. code-block:: python
+
+    Profiler Report
+
+    Profile stats for: get_train_batch
+            4869394 function calls (4863767 primitive calls) in 18.893 seconds
+    Ordered by: cumulative time
+    List reduced from 76 to 10 due to restriction <10>
+    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+    3752/1876    0.011    0.000   18.887    0.010 {built-in method builtins.next}
+        1876     0.008    0.000   18.877    0.010 dataloader.py:344(__next__)
+        1876     0.074    0.000   18.869    0.010 dataloader.py:383(_next_data)
+        1875     0.012    0.000   18.721    0.010 fetch.py:42(fetch)
+        1875     0.084    0.000   18.290    0.010 fetch.py:44(<listcomp>)
+        60000    1.759    0.000   18.206    0.000 mnist.py:80(__getitem__)
+        60000    0.267    0.000   13.022    0.000 transforms.py:68(__call__)
+        60000    0.182    0.000    7.020    0.000 transforms.py:93(__call__)
+        60000    1.651    0.000    6.839    0.000 functional.py:42(to_tensor)
+        60000    0.260    0.000    5.734    0.000 transforms.py:167(__call__)
+
+You can also reference this profiler in your LightningModule to profile specific actions of interest.
+If you don't want to always have the profiler turned on, you can optionally pass a `PassThroughProfiler`
+which will allow you to skip profiling without having to make any code changes. Each profiler has a
+method `profile()` which returns a context handler. Simply pass in the name of your action that you want
+to track and the profiler will record performance for code executed within this context.
+
 .. code-block:: python
 
     from pytorch_lightning.profiler import Profiler, PassThroughProfiler
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 0a14aa27f8270..6b0f68974ece5 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -16,6 +16,7 @@
 
 import cProfile
 import io
+import inspect
 import os
 import pstats
 import time
@@ -299,10 +300,16 @@ def __init__(self,
                  output_filename: Optional[str] = None,
                  enabled=True,
                  use_cuda=True,
-                 record_shapes=True,
-                 profile_memory=True,
+                 record_shapes=False,
+                 profile_memory=False,
                  group_by_input_shape=True,
-                 sort_by_key: str = "self_cuda_memory_usage"):
+                 with_stack=True,
+                 use_kineto=False,
+                 use_cpu = True,
+                 emit_nvtx=False,
+                 export_to_chrome=False,
+                 path_to_export_trace=None,
+                 sort_by_key: str = "cpu_time_total"):
         """
         Args:
             output_filename: optionally save profile results to file instead of printing
@@ -311,15 +318,32 @@ def __init__(self,
             use_cuda: Enables timing of CUDA events as well using the cudaEvent API. 
                 Adds approximately 4us of overhead to each tensor operation. Default: True
             record_shapes:  If shapes recording is set, information about input dimensions will be collected.
-            profile_memory: Whether to report memory usage, default: True
-        """
+            profile_memory: Whether to report memory usage, default: False (1.6.0)
+            with_stack: record source information (file and line number) for the ops (1.7.0)
+            use_kineto: experimental support for Kineto profiler (1.8.0)
+            use_cpu: profile CPU events; setting to False requires use_kineto=True and can be used to lower the overhead for GPU-only profiling (1.8.0)
+            emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
+                * Run: nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
+                To visualize, you can either use:
+                    * nvvp trace_name.prof
+                    * torch.autograd.profiler.load_nvprof(path)
+            export_to_chrome: Whether to export the sequence of profiled operators for Chrome.
+            sort_by_key: Key used to sort the profiled table
+            path_to_export_trace: Path to exported traces. By default, it will be saved where the file is being run.
+        """ 
         self.profiled_actions = {}
         self.enabled = enabled
         self.use_cuda = use_cuda
         self.record_shapes = record_shapes
         self.profile_memory = profile_memory
         self.sort_by_key = sort_by_key
+        self.with_stack = with_stack
         self.group_by_input_shape = group_by_input_shape and record_shapes
+        self.use_kineto = use_kineto
+        self.use_cpu = use_cpu
+        self.emit_nvtx = emit_nvtx
+        self.export_to_chrome = export_to_chrome
+        self.path_to_export_trace = path_to_export_trace
         if self.sort_by_key not in self.available_sort_by_keys:
             raise MisconfigurationException(
                 f"Found sort_by_key: {sort_by_key}. Should be within {self.available_sort_by_keys}. ")
@@ -334,39 +358,73 @@ def __init__(self,
         super().__init__(output_streams=streaming_out)
 
     def start(self, action_name: str) -> None:
-        # PyTorch profiler doesn't seem to work with multiple processes
+        # PyTorch Profiler doesn't seem to work with multiple processes
+        # Disable Profiler.
         self.enabled = os.getenv("LOCAL_RANK", None) is None
         if action_name not in self.profiled_actions and action_name in self.PROFILED_FUNCTIONS:
-            self.profiled_actions[action_name] = torch.autograd.profiler.profile(
-                enabled=self.enabled,
-                use_cuda=self.use_cuda,
-                record_shapes=self.record_shapes,
-                profile_memory=self.profile_memory).__enter__()
+            self.profiled_actions[action_name] = []
+            if self.emit_nvtx:
+                self._create_profiler(action_name, torch.cuda.profiler.profile, enter=False)
+                # warmup
+                x = torch.rand(100, 100, device='cuda')
+                temp = x * x
+                self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx)
+            else:
+                self._create_profiler(action_name, torch.autograd.profiler.profile)
+
+    def _create_profiler(self, action_name, profiler, enter=False):
+        init_args = inspect.signature(profiler.__init__).parameters
+        profiler_args = {
+            k: v for k, v in vars(self).items() if k in init_args
+        }
+        profiler = profiler(**profiler_args)
+        if enter:
+            profiler = profiler.__enter__()
+        self.profiled_actions[action_name].append(profiler)
 
     def stop(self, action_name: str) -> None:
         if action_name in self.PROFILED_FUNCTIONS and self.enabled:
-            pr = self.profiled_actions.get(action_name)
-            if pr is None:
+            profilers = self.profiled_actions.get(action_name)
+            if not profilers:
                 raise ValueError(  # pragma: no-cover
                     f"Attempting to stop recording an action ({action_name}) which was never started."
                 )
-            
-            # todo: Find a better solution to exit context manager
-            try:
-                _ = pr.__exit__(None, None, None)
-            except RuntimeError as e:
-                if "Expected debug info of type 2" in str(e):
-                    pass
-                else:
-                    raise RuntimeError(str(e))
+            else:
+                for pr in profilers[::-1]:
+                    self._handle_exit(pr)
+
+    def _handle_exit(self, pr):
+        # todo: Find a better solution to exit context manager
+        if pr is None:
+            return
+        try:
+            _ = pr.__exit__(None, None, None)
+        except RuntimeError as e:
+            if "Expected debug info of type 2" in str(e):
+                pass
+            elif "can't disable profiler when it's not running" in str(e):
+                pass
+            elif "generator didn't stop" in str(e):
+                pass
+            else:
+                raise RuntimeError(str(e))        
 
     def summary(self) -> str:
         recorded_stats = {}
         if self.enabled:
             for action_name, pr in self.profiled_actions.items():
-                table = self.profiled_actions[action_name].key_averages(
-                    group_by_input_shape=self.group_by_input_shape).table(sort_by=self.sort_by_key)
-                recorded_stats[action_name] = table
+                pr = pr[-1]
+                if self.export_to_chrome:
+                    filename = f"{action_name}_trace.json"
+                    path_to_trace = filename if self.path_to_export_trace is None \
+                        else os.path.join(self.path_to_export_trace, filename)
+                    pr.export_chrome_trace(path_to_trace)
+                if self.emit_nvtx:
+                    return ""
+                else:
+                    table = pr.key_averages(
+                        group_by_input_shape=self.group_by_input_shape).table(sort_by=self.sort_by_key)
+                    recorded_stats[action_name] = table
 
             # log to standard out
             output_string = f"{os.linesep}Profiler Report{os.linesep}"
diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py
index 0c2dfec93715a..b7f333626a152 100644
--- a/pytorch_lightning/trainer/connectors/profiler_connector.py
+++ b/pytorch_lightning/trainer/connectors/profiler_connector.py
@@ -21,18 +21,15 @@
     PytorchProfiler,
     SimpleProfiler,
 )
-from pytorch_lightning.utilities import rank_zero_warn, _PYTORCH_GREATER_EQUAL_1_6_0
+from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 PROFILERS = {
     "simple": SimpleProfiler,
     "advanced": AdvancedProfiler,
+    "pytorch": PytorchProfiler
 }
 
-if _PYTORCH_GREATER_EQUAL_1_6_0:
-    PROFILERS["pytorch"] = PytorchProfiler
-
-
 class ProfilerConnector:
 
     def __init__(self, trainer):
diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py
index fe21849d2962d..0a5ed04eb72a3 100644
--- a/pytorch_lightning/utilities/__init__.py
+++ b/pytorch_lightning/utilities/__init__.py
@@ -34,7 +34,6 @@
     _module_available,
     _NATIVE_AMP_AVAILABLE,
     _OMEGACONF_AVAILABLE,
-    _PYTORCH_GREATER_EQUAL_1_6_0,
     _RPC_AVAILABLE,
     _TORCHTEXT_AVAILABLE,
     _XLA_AVAILABLE,
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index 09a3d36938bce..19493f0619a79 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -52,5 +52,4 @@ def _module_available(module_path: str) -> bool:
 _RPC_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.rpc')
 _GROUP_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.group')
 _FAIRSCALE_PIPE_AVAILABLE = _FAIRSCALE_AVAILABLE and LooseVersion(torch.__version__) >= LooseVersion("1.6.0")
-_BOLTS_AVAILABLE = _module_available('pl_bolts')
-_PYTORCH_GREATER_EQUAL_1_6_0 = LooseVersion(torch.__version__) >= LooseVersion("1.6.0")
+_BOLTS_AVAILABLE = _module_available('pl_bolts')
\ No newline at end of file
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 7d28afc6b0093..ec8a9b3178a5d 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -30,10 +30,10 @@
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv
 from pytorch_lightning.loggers import TensorBoardLogger
-from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, SimpleProfiler
+from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, PytorchProfiler, SimpleProfiler
 from pytorch_lightning.trainer.logging import TrainerLoggingMixin
 from pytorch_lightning.trainer.states import TrainerState
-from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE, _PYTORCH_GREATER_EQUAL_1_6_0
+from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import BoringModel, EvalModelTemplate
@@ -1421,7 +1421,7 @@ def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, l
     ('simple', SimpleProfiler),
     ('Simple', SimpleProfiler),
     ('advanced', AdvancedProfiler),
-    ('pytorch', AdvancedProfiler),
+    ('pytorch', PytorchProfiler),
 ])
 def test_trainer_profiler_correct_args(profiler, expected):
     kwargs = {'profiler': profiler} if profiler is not None else {}
@@ -1444,7 +1444,6 @@ def test_trainer_profiler_incorrect_arg_type(profiler):
         Trainer(profiler=profiler)
 
 
-# @pytest.mark.skipif(not _PYTORCH_GREATER_EQUAL_1_6_0, reason='test needs PyTorch 1.7+')
 def test_pytorch_profiler(tmpdir):
     class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
@@ -1464,3 +1463,13 @@ def training_step(self, batch, batch_idx):
     )
 
     trainer.fit(model)
+
+
+def test_pytorch_profiler_2(tmpdir):
+    print(f'Version = {torch.__version__}')
+
+    x = torch.rand(100, 100, device='cuda')
+
+    with torch.cuda.profiler.profile():
+        with torch.autograd.profiler.emit_nvtx():
+            temp = x * x
\ No newline at end of file

From c85661ad0a61a74b353a04dcf258ac5e45093add Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 19 Jan 2021 10:46:14 +0000
Subject: [PATCH 04/28] resolve flake8

---
 pytorch_lightning/profiler/__init__.py        |  9 +++---
 pytorch_lightning/profiler/profilers.py       | 32 ++++++++-----------
 .../trainer/connectors/profiler_connector.py  |  1 +
 pytorch_lightning/utilities/imports.py        |  2 +-
 tests/trainer/test_trainer.py                 | 10 ------
 5 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index e3339d65e80a6..ff714248642d3 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -118,17 +118,16 @@ def custom_processing_step(self, data):
 PyTorch Profiling
 --------------------
 
-Autograd includes a profiler that lets you inspect the cost of different operators inside your model - both on the CPU and GPU. 
-
-.. _cProfiler: https://docs.python.org/3/library/profile.html#module-cProfile
+Autograd includes a profiler that lets you inspect the cost of different operators
+inside your model - both on the CPU and GPU.
 
 .. code-block:: python
 
-    trainer = Trainer(..., profiler="advanced")
+    trainer = Trainer(..., profiler="pytorch")
 
     or
 
-    profiler = AdvancedProfiler()
+    profiler = PytorchProfiler()
     trainer = Trainer(..., profiler=profiler)
 
 The profiler's results will be printed at the completion of a training `fit()`. This profiler
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 6b0f68974ece5..3f541e5144d09 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -15,8 +15,8 @@
 """Profiler to check if there are any bottlenecks in your code."""
 
 import cProfile
-import io
 import inspect
+import io
 import os
 import pstats
 import time
@@ -31,7 +31,6 @@
 from pytorch_lightning import _logger as log
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities import rank_zero_only
 
 
 class BaseProfiler(ABC):
@@ -296,16 +295,16 @@ class PytorchProfiler(BaseProfiler):
 
     PROFILED_FUNCTIONS = ["training_step", "validation_step", "test_step"]
 
-    def __init__(self, 
+    def __init__(self,
                  output_filename: Optional[str] = None,
                  enabled=True,
-                 use_cuda=True,
+                 use_cuda=False,
                  record_shapes=False,
                  profile_memory=False,
                  group_by_input_shape=True,
                  with_stack=True,
                  use_kineto=False,
-                 use_cpu = True,
+                 use_cpu=True,
                  emit_nvtx=False,
                  export_to_chrome=False,
                  path_to_export_trace=None,
@@ -315,7 +314,7 @@ def __init__(self,
             output_filename: optionally save profile results to file instead of printing
                 to std out when training is finished.
             enabled: Setting this to False makes this context manager a no-op. Default: True
-            use_cuda: Enables timing of CUDA events as well using the cudaEvent API. 
+            use_cuda: Enables timing of CUDA events as well using the cudaEvent API.
                 Adds approximately 4us of overhead to each tensor operation. Default: True
             record_shapes:  If shapes recording is set, information about input dimensions will be collected.
             profile_memory: Whether to report memory usage, default: True (1.6.0)
@@ -329,10 +328,13 @@ def __init__(self,
                     * torch.autograd.profiler.load_nvprof(path)
             export_to_chrome: Wether to export the sequence of profiled operators for Chrome.
             sort_by_key: Keys to sort out profiled table
-            path_to_export_trace: Path to exported traces. By default, it will be save where the file being is being run.
-        """ 
+            path_to_export_trace: Path to exported traces. By default, it will be save
+                where the file being is being run.
+        """
         self.profiled_actions = {}
-        self.enabled = enabled
+        # PyTorch Profiler doesn't seem to work with multiple processes
+        enabled = enabled and os.getenv("LOCAL_RANK", None) is None
+        self.profiled_actions_enabled = {n: enabled for n in self.PROFILED_FUNCTIONS}
         self.use_cuda = use_cuda
         self.record_shapes = record_shapes
         self.profile_memory = profile_memory
@@ -358,16 +360,11 @@ def __init__(self,
         super().__init__(output_streams=streaming_out)
 
     def start(self, action_name: str) -> None:
-        # PyTorch Profiler doesn't seem to work with multiple processes
-        # Disable Profiler.
-        self.enabled = os.getenv("LOCAL_RANK", None) is None
         if action_name not in self.profiled_actions and action_name in self.PROFILED_FUNCTIONS:
+            self.enabled = self.profiled_actions_enabled[action_name]
             self.profiled_actions[action_name] = []
             if self.emit_nvtx:
                 self._create_profiler(action_name, torch.cuda.profiler.profile, enter=False)
-                # warmup
-                x = torch.rand(100, 100, device='cuda')
-                temp = x * x
                 self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx)
             else:
                 self._create_profiler(action_name, torch.autograd.profiler.profile)
@@ -392,11 +389,10 @@ def stop(self, action_name: str) -> None:
             else:
                 for pr in profilers[::-1]:
                     self._handle_exit(pr)
+        self.profiled_actions_enabled[action_name] = True
 
     def _handle_exit(self, pr):
         # todo: Find a better solution to exit context manager
-        if pr is None:
-            return
         try:
             _ = pr.__exit__(None, None, None)
         except RuntimeError as e:
@@ -407,7 +403,7 @@ def _handle_exit(self, pr):
             elif "generator didn't stop" in str(e):
                 pass
             else:
-                raise RuntimeError(str(e))        
+                raise RuntimeError(str(e))
 
     def summary(self) -> str:
         recorded_stats = {}
diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py
index b7f333626a152..2daf0ae2b9e4a 100644
--- a/pytorch_lightning/trainer/connectors/profiler_connector.py
+++ b/pytorch_lightning/trainer/connectors/profiler_connector.py
@@ -30,6 +30,7 @@
     "pytorch": PytorchProfiler
 }
 
+
 class ProfilerConnector:
 
     def __init__(self, trainer):
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index 19493f0619a79..acdebfbf239e4 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -52,4 +52,4 @@ def _module_available(module_path: str) -> bool:
 _RPC_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.rpc')
 _GROUP_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.group')
 _FAIRSCALE_PIPE_AVAILABLE = _FAIRSCALE_AVAILABLE and LooseVersion(torch.__version__) >= LooseVersion("1.6.0")
-_BOLTS_AVAILABLE = _module_available('pl_bolts')
\ No newline at end of file
+_BOLTS_AVAILABLE = _module_available('pl_bolts')
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ec8a9b3178a5d..007285a0416ec 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1463,13 +1463,3 @@ def training_step(self, batch, batch_idx):
     )
 
     trainer.fit(model)
-
-
-def test_pytorch_profiler_2(tmpdir):
-    print(f'Version = {torch.__version__}')
-
-    x = torch.rand(100, 100, device='cuda')
-
-    with torch.cuda.profiler.profile():
-        with torch.autograd.profiler.emit_nvtx():
-            temp = x * x
\ No newline at end of file

From 9a62eb84759e9368a20da660989a978181b63f51 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 19 Jan 2021 11:03:59 +0000
Subject: [PATCH 05/28] update doc

---
 pytorch_lightning/profiler/__init__.py  | 89 +++++++++++++++----------
 pytorch_lightning/profiler/profilers.py | 14 ++--
 2 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index ff714248642d3..6d82889294eae 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -127,56 +127,75 @@ def custom_processing_step(self, data):
 
     or
 
-    profiler = PytorchProfiler()
+    profiler = PytorchProfiler(
+            output_filename = ...
+            enabled = ...
+            use_cuda = ...
+            record_shapes = ...
+            profile_memory = ...
+            with_stack = ...
+            use_kineto = ...
+            use_cpu = ...
+            emit_nvtx = ...
+            export_to_chrome = ...
+            sort_by_key = ...
+            path_to_export_trace = ...
+    )
     trainer = Trainer(..., profiler=profiler)
 
 The profiler's results will be printed at the completion of a training `fit()`. This profiler
 report can be quite long, so you can also specify an `output_filename` to save the report instead
 of logging it to the output in your terminal. The output below shows the profiling for the action
 `get_train_batch`.
+This profiler will record only for `training_step`, `evaluation_step` and `test_step` functions.
 
 .. code-block:: python
 
     Profiler Report
 
-    Profile stats for: get_train_batch
-            4869394 function calls (4863767 primitive calls) in 18.893 seconds
-    Ordered by: cumulative time
-    List reduced from 76 to 10 due to restriction <10>
-    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
-    3752/1876    0.011    0.000   18.887    0.010 {built-in method builtins.next}
-        1876     0.008    0.000   18.877    0.010 dataloader.py:344(__next__)
-        1876     0.074    0.000   18.869    0.010 dataloader.py:383(_next_data)
-        1875     0.012    0.000   18.721    0.010 fetch.py:42(fetch)
-        1875     0.084    0.000   18.290    0.010 fetch.py:44(<listcomp>)
-        60000    1.759    0.000   18.206    0.000 mnist.py:80(__getitem__)
-        60000    0.267    0.000   13.022    0.000 transforms.py:68(__call__)
-        60000    0.182    0.000    7.020    0.000 transforms.py:93(__call__)
-        60000    1.651    0.000    6.839    0.000 functional.py:42(to_tensor)
-        60000    0.260    0.000    5.734    0.000 transforms.py:167(__call__)
-
-You can also reference this profiler in your LightningModule to profile specific actions of interest.
-If you don't want to always have the profiler turned on, you can optionally pass a `PassThroughProfiler`
-which will allow you to skip profiling without having to make any code changes. Each profiler has a
-method `profile()` which returns a context handler. Simply pass in the name of your action that you want
-to track and the profiler will record performance for code executed within this context.
+    Profile stats for: training_step
+    ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
+    Name                   Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg
+    ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
+    t                      62.10%           1.044ms          62.77%           1.055ms          1.055ms
+    addmm                  32.32%           543.135us        32.69%           549.362us        549.362us
+    mse_loss               1.35%            22.657us         3.58%            60.105us         60.105us
+    mean                   0.22%            3.694us          2.05%            34.523us         34.523us
+    div_                   0.64%            10.756us         1.90%            32.001us         16.000us
+    ones_like              0.21%            3.461us          0.81%            13.669us         13.669us
+    sum_out                0.45%            7.638us          0.74%            12.432us         12.432us
+    transpose              0.23%            3.786us          0.68%            11.393us         11.393us
+    as_strided             0.60%            10.060us         0.60%            10.060us         3.353us
+    to                     0.18%            3.059us          0.44%            7.464us          7.464us
+    empty_like             0.14%            2.387us          0.41%            6.859us          6.859us
+    empty_strided          0.38%            6.351us          0.38%            6.351us          3.175us
+    fill_                  0.28%            4.782us          0.33%            5.566us          2.783us
+    expand                 0.20%            3.336us          0.28%            4.743us          4.743us
+    empty                  0.27%            4.456us          0.27%            4.456us          2.228us
+    copy_                  0.15%            2.526us          0.15%            2.526us          2.526us
+    broadcast_tensors      0.15%            2.492us          0.15%            2.492us          2.492us
+    size                   0.06%            0.967us          0.06%            0.967us          0.484us
+    is_complex             0.06%            0.961us          0.06%            0.961us          0.481us
+    stride                 0.03%            0.517us          0.03%            0.517us          0.517us
+    ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
+    Self CPU time total: 1.681ms
+
+When running with `PytorchProfiler(emit_nvtx=True)`. You should run as following:
+
+nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
+
+To visualize the profiled operation, you can either:
+
+* Use: nvvp trace_name.prof
+
+* Use: torch.autograd.profiler.load_nvprof(path)
 
 .. code-block:: python
 
-    from pytorch_lightning.profiler import Profiler, PassThroughProfiler
-
-    class MyModel(LightningModule):
-        def __init__(self, profiler=None):
-            self.profiler = profiler or PassThroughProfiler()
+    >>> import torch
+    >>> torch.autograd.profiler.load_nvprof(".../trace_name.prof")
+    [<FunctionEvent id=2 node_id=0 cpu_time=2050.393s ... is_remote=True seq_nr=-1>]
 
-        def custom_processing_step(self, data):
-            with profiler.profile('my_custom_action'):
-                # custom processing step
-            return data
-
-    profiler = Profiler()
-    model = MyModel(profiler)
-    trainer = Trainer(profiler=profiler, max_epochs=1)
 
 """
 
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 3f541e5144d09..c961d9c830df5 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -301,10 +301,10 @@ def __init__(self,
                  use_cuda=False,
                  record_shapes=False,
                  profile_memory=False,
-                 group_by_input_shape=True,
-                 with_stack=True,
+                 group_by_input_shape=False,
+                 with_stack=False,
                  use_kineto=False,
-                 use_cpu=True,
+                 use_cpu=False,
                  emit_nvtx=False,
                  export_to_chrome=False,
                  path_to_export_trace=None,
@@ -369,15 +369,15 @@ def start(self, action_name: str) -> None:
             else:
                 self._create_profiler(action_name, torch.autograd.profiler.profile)
 
-    def _create_profiler(self, action_name, profiler, enter=False):
+    def _create_profiler(self, action_name, profiler, enter=True):
         init_args = inspect.signature(profiler.__init__).parameters
         profiler_args = {
             k: v for k, v in vars(self).items() if k in init_args
         }
-        profiler = profiler(**profiler_args)
+        pr = profiler(**profiler_args)
         if enter:
-            profiler = profiler.__enter__()
-        self.profiled_actions[action_name].append(profiler)
+            pr = pr.__enter__()
+        self.profiled_actions[action_name].append(pr)
 
     def stop(self, action_name: str) -> None:
         if action_name in self.PROFILED_FUNCTIONS and self.enabled:

From 6f54b69d2960cb17a2460b36e5be8d52cd58a183 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 19 Jan 2021 11:04:49 +0000
Subject: [PATCH 06/28] update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b56321765bbf0..47d7429a29c0d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -57,6 +57,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842))
 
 
+- Added `PytorchProfiler` ([#5560](https://github.com/PyTorchLightning/pytorch-lightning/pull/5560))
+
 
 ### Changed
 

From 1bbe314dee11c396adf571ee991f64cc17ec58bd Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 19 Jan 2021 11:23:05 +0000
Subject: [PATCH 07/28] clean doc

---
 pytorch_lightning/profiler/__init__.py | 9 +--------
 trace_name.prof                        | 0
 2 files changed, 1 insertion(+), 8 deletions(-)
 create mode 100644 trace_name.prof

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 6d82889294eae..bc3c9c5bc29ed 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -188,14 +188,7 @@ def custom_processing_step(self, data):
 
 * Use: nvvp trace_name.prof
 
-* Use: torch.autograd.profiler.load_nvprof(path)
-
-.. code-block:: python
-
-    >>> import torch
-    >>> torch.autograd.profiler.load_nvprof(".../trace_name.prof")
-    [<FunctionEvent id=2 node_id=0 cpu_time=2050.393s ... is_remote=True seq_nr=-1>]
-
+* Use: python -c 'import torch; print(torch.autograd.profiler.load_nvprof("trace_name.prof"))'
 
 """
 
diff --git a/trace_name.prof b/trace_name.prof
new file mode 100644
index 0000000000000..e69de29bb2d1d

From bd035da941b0751307215bcb7d98d81a69ba3b4c Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 19 Jan 2021 11:30:58 +0000
Subject: [PATCH 08/28] delete prof file

---
 .gitignore      | 1 +
 trace_name.prof | 0
 2 files changed, 1 insertion(+)
 delete mode 100644 trace_name.prof

diff --git a/.gitignore b/.gitignore
index 743fdaaf33dc2..237dbef370a2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ pytorch\ lightning
 test-reports/
 wandb
 .forked/
+*.prof
diff --git a/trace_name.prof b/trace_name.prof
deleted file mode 100644
index e69de29bb2d1d..0000000000000

From e689cdabef902f997b99de5d534a1a143f03c0cc Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Thu, 21 Jan 2021 19:00:31 +0000
Subject: [PATCH 09/28] merge pr codebase

---
 CHANGELOG.md                                  |   2 +-
 pytorch_lightning/profiler/__init__.py        |  25 +--
 pytorch_lightning/profiler/profilers.py       | 163 +++++++++++-------
 .../trainer/connectors/profiler_connector.py  |   4 +-
 tests/trainer/test_trainer.py                 |  75 ++++++--
 5 files changed, 174 insertions(+), 95 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 47d7429a29c0d..537cc317dae3d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -57,7 +57,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842))
 
 
-- Added `PytorchProfiler` ([#5560](https://github.com/PyTorchLightning/pytorch-lightning/pull/5560))
+- Added `PyTorchProfiler` ([#5560](https://github.com/PyTorchLightning/pytorch-lightning/pull/5560))
 
 
 ### Changed
diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index bc3c9c5bc29ed..3f395fdcfd9aa 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -121,29 +121,18 @@ def custom_processing_step(self, data):
 Autograd includes a profiler that lets you inspect the cost of different operators
 inside your model - both on the CPU and GPU.
 
+Find the Pytorch Profiler doc at [PyTorch Profiler](https://pytorch-lightning.readthedocs.io/en/stable/profiler.html)
+
 .. code-block:: python
 
     trainer = Trainer(..., profiler="pytorch")
 
     or
 
-    profiler = PytorchProfiler(
-            output_filename = ...
-            enabled = ...
-            use_cuda = ...
-            record_shapes = ...
-            profile_memory = ...
-            with_stack = ...
-            use_kineto = ...
-            use_cpu = ...
-            emit_nvtx = ...
-            export_to_chrome = ...
-            sort_by_key = ...
-            path_to_export_trace = ...
-    )
+    profiler = PyTorchProfiler(...)
     trainer = Trainer(..., profiler=profiler)
 
-The profiler's results will be printed at the completion of a training `fit()`. This profiler
+The profiler's results will be printed on the completion of a training `fit()`. This profiler
 report can be quite long, so you can also specify an `output_filename` to save the report instead
 of logging it to the output in your terminal. The output below shows the profiling for the action
 `get_train_batch`.
@@ -180,7 +169,7 @@ def custom_processing_step(self, data):
     ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
     Self CPU time total: 1.681ms
 
-When running with `PytorchProfiler(emit_nvtx=True)`. You should run as following:
+When running with `PyTorchProfiler(emit_nvtx=True)`. You should run as following:
 
 nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
 
@@ -196,7 +185,7 @@ def custom_processing_step(self, data):
     AdvancedProfiler,
     BaseProfiler,
     PassThroughProfiler,
-    PytorchProfiler,
+    PyTorchProfiler,
     SimpleProfiler,
 )
 
@@ -205,5 +194,5 @@ def custom_processing_step(self, data):
     'SimpleProfiler',
     'AdvancedProfiler',
     'PassThroughProfiler',
-    "PytorchProfiler",
+    "PyTorchProfiler",
 ]
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index c961d9c830df5..6c2141db50330 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -287,62 +287,83 @@ def __del__(self):
             self.output_file.close()
 
 
-class PytorchProfiler(BaseProfiler):
+class PyTorchProfiler(BaseProfiler):
     """
-    This profiler uses PyTorch's Autograd Profiler and let's you inspect the cost of
+    This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
     different operators inside your model - both on the CPU and GPU
     """
 
-    PROFILED_FUNCTIONS = ["training_step", "validation_step", "test_step"]
+    PROFILER_OVERHEAD_MAX_TOLERANCE = 0.0005
 
     def __init__(self,
                  output_filename: Optional[str] = None,
-                 enabled=True,
-                 use_cuda=False,
-                 record_shapes=False,
-                 profile_memory=False,
-                 group_by_input_shape=False,
-                 with_stack=False,
-                 use_kineto=False,
-                 use_cpu=False,
-                 emit_nvtx=False,
-                 export_to_chrome=False,
-                 path_to_export_trace=None,
-                 sort_by_key: str = "cpu_time_total"):
+                 enabled: bool = True,
+                 use_cuda: bool = False,
+                 record_shapes: bool = False,
+                 profile_memory: bool = False,
+                 group_by_input_shapes: bool = False,
+                 with_stack: bool = False,
+                 use_kineto: bool = False,
+                 use_cpu: bool = False,
+                 emit_nvtx: bool = False,
+                 export_to_chrome: bool = False,
+                 path_to_export_trace: bool = None,
+                 row_limit: int = 20,
+                 sort_by_key: Optional[str] = None,
+                 profiled_functions=["training_step_and_backward", "validation_step", "test_step"]):
         """
         Args:
+
             output_filename: optionally save profile results to file instead of printing
                 to std out when training is finished.
+
             enabled: Setting this to False makes this context manager a no-op. Default: True
+
             use_cuda: Enables timing of CUDA events as well using the cudaEvent API.
                 Adds approximately 4us of overhead to each tensor operation. Default: True
+
             record_shapes:  If shapes recording is set, information about input dimensions will be collected.
+
             profile_memory: Whether to report memory usage, default: True (1.6.0)
+
             with_stack: record source information (file and line number) for the ops (1.7.0)
+
             use_kineto: experimental support for Kineto profiler (1.8.0)
+
             use_cpu: use_kineto=True and can be used to lower the overhead for GPU-only profiling (1.8.0)
+
             emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
                 * Run: nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
                 To visualize, you can either use:
                     * nvvp trace_name.prof
                     * torch.autograd.profiler.load_nvprof(path)
+
             export_to_chrome: Wether to export the sequence of profiled operators for Chrome.
-            sort_by_key: Keys to sort out profiled table
+
             path_to_export_trace: Path to exported traces. By default, it will be save
                 where the file being is being run.
+
+            row_limit: Limit the number of rows in a table, `0` is a special value that
+                 removes the limit completely.
+
+            sort_by_key: Keys to sort out profiled table
+
+            profiled_functions: list of profiled functions which will create a context manager on.
+                Any other will be pass through.
         """
         self.profiled_actions = {}
         # PyTorch Profiler doesn't seem to work with multiple processes
-        enabled = enabled and os.getenv("LOCAL_RANK", None) is None
-        self.profiled_actions_enabled = {n: enabled for n in self.PROFILED_FUNCTIONS}
+        self.enabled = enabled and os.getenv("LOCAL_RANK", None) is None
+        self.profiled_functions = profiled_functions
         self.use_cuda = use_cuda
         self.record_shapes = record_shapes
         self.profile_memory = profile_memory
-        self.sort_by_key = sort_by_key
+        self.sort_by_key = sort_by_key or ("cuda_time_total" if self.use_cuda else "cpu_time_total")
         self.with_stack = with_stack
-        self.group_by_input_shape = group_by_input_shape and record_shapes
+        self.group_by_input_shapes = group_by_input_shapes and record_shapes
         self.use_kineto = use_kineto
         self.use_cpu = use_cpu
+        self.row_limit = row_limit
         self.emit_nvtx = emit_nvtx
         self.export_to_chrome = export_to_chrome
         self.path_to_export_trace = path_to_export_trace
@@ -350,8 +371,14 @@ def __init__(self,
             raise MisconfigurationException(
                 f"Found sort_by_key: {sort_by_key}. Should be within {self.available_sort_by_keys}. ")
 
+        self.profiled_actions = {}
+        self.context_names = {}
+        self.running_stack = []
+        self.profiler = None
+
         self.output_fname = output_filename
         self.output_file = None
+
         if self.output_fname:
             fs = get_filesystem(self.output_fname)
             self.output_file = fs.open(self.output_fname, "w")
@@ -360,14 +387,22 @@ def __init__(self,
         super().__init__(output_streams=streaming_out)
 
     def start(self, action_name: str) -> None:
-        if action_name not in self.profiled_actions and action_name in self.PROFILED_FUNCTIONS:
-            self.enabled = self.profiled_actions_enabled[action_name]
-            self.profiled_actions[action_name] = []
-            if self.emit_nvtx:
-                self._create_profiler(action_name, torch.cuda.profiler.profile, enter=False)
-                self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx)
-            else:
-                self._create_profiler(action_name, torch.autograd.profiler.profile)
+        # stop the running profiler if any
+        if action_name in self.profiled_functions:
+            if len(self.running_stack) > 0:
+                self._stop(self.running_stack[-1])
+            self.running_stack.append(action_name)
+
+            self.context_names[action_name] = "/".join(self.running_stack)
+
+            self._start(action_name)
+
+    def _start(self, action_name: str) -> None:
+        if self.emit_nvtx:
+            self._create_profiler(action_name, torch.cuda.profiler.profile, enter=False)
+            self._create_profiler(action_name, torch.autograd.profiler.emit_nvtx)
+        else:
+            self._create_profiler(action_name, torch.autograd.profiler.profile)
 
     def _create_profiler(self, action_name, profiler, enter=True):
         init_args = inspect.signature(profiler.__init__).parameters
@@ -377,49 +412,62 @@ def _create_profiler(self, action_name, profiler, enter=True):
         pr = profiler(**profiler_args)
         if enter:
             pr = pr.__enter__()
-        self.profiled_actions[action_name].append(pr)
+        self.profiler = pr
+
+    def _stop(self, action_name: str) -> None:
+        if self.profiler is None:
+            return
+        self.profiler.__exit__(
+            exc_type=None,
+            exc_val=None,
+            exc_tb=None
+        )
+        events = self.profiler.function_events
+        self.profiler = None
+        for name in self.running_stack:
+            if name not in self.profiled_actions:
+                self.profiled_actions[name] = events
+            else:
+                self.profiled_actions[name] += events
 
     def stop(self, action_name: str) -> None:
-        if action_name in self.PROFILED_FUNCTIONS and self.enabled:
-            profilers = self.profiled_actions.get(action_name)
-            if not profilers:
+        if action_name in self.profiled_functions:
+            if len(self.running_stack) == 0 or self.running_stack[-1] != action_name:
                 raise ValueError(  # pragma: no-cover
                     f"Attempting to stop recording an action ({action_name}) which was never started."
                 )
-            else:
-                for pr in profilers[::-1]:
-                    self._handle_exit(pr)
-        self.profiled_actions_enabled[action_name] = True
-
-    def _handle_exit(self, pr):
-        # todo: Find a better solution to exit context manager
-        try:
-            _ = pr.__exit__(None, None, None)
-        except RuntimeError as e:
-            if "Expected debug info of type 2" in str(e):
-                pass
-            elif "can't disable profiler when it's not running" in str(e):
-                pass
-            elif "generator didn't stop" in str(e):
-                pass
-            else:
-                raise RuntimeError(str(e))
+            self._stop(action_name)
+            self.running_stack.pop()
+            # restore running profiler
+            if len(self.running_stack) > 0:
+                self._start(self.running_stack[-1])
 
     def summary(self) -> str:
         recorded_stats = {}
+        output_string = ''
+
         if self.enabled:
-            for action_name, pr in self.profiled_actions.items():
-                pr = pr[-1]
+            for action_name, events in self.profiled_actions.items():
+
+                # next line is a workaround for a pytorch issue (fixed on master, still present
+                # on 1.7). Without it the code fails with `AssertionError: There is already a CPU
+                # parent event for detach`
+                events.populate_cpu_children = lambda: None
+
                 if self.export_to_chrome:
                     filename = f"{action_name}_trace.json"
                     path_to_trace = filename if self.path_to_export_trace is None \
                         else os.path.join(self.path_to_export_trace, filename)
-                    pr.export_chrome_trace(path_to_trace)
+                    events.export_chrome_trace(path_to_trace)
+
                 if self.emit_nvtx:
-                    return ""
+                    return output_string
+
                 else:
-                    table = pr.key_averages(
-                        group_by_input_shape=self.group_by_input_shape).table(sort_by=self.sort_by_key)
+                    table = events.key_averages(
+                        group_by_input_shapes=self.group_by_input_shapes).table(
+                            sort_by=self.sort_by_key,
+                            row_limit=self.row_limit)
                     recorded_stats[action_name] = table
 
             # log to standard out
@@ -429,8 +477,7 @@ def summary(self) -> str:
                     f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}"
                 )
 
-            return output_string
-        return ''
+        return output_string
 
     def describe(self):
         """Logs a profile report after the conclusion of the training run."""
diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py
index 2daf0ae2b9e4a..d2e6ada35412e 100644
--- a/pytorch_lightning/trainer/connectors/profiler_connector.py
+++ b/pytorch_lightning/trainer/connectors/profiler_connector.py
@@ -18,7 +18,7 @@
     AdvancedProfiler,
     BaseProfiler,
     PassThroughProfiler,
-    PytorchProfiler,
+    PyTorchProfiler,
     SimpleProfiler,
 )
 from pytorch_lightning.utilities import rank_zero_warn
@@ -27,7 +27,7 @@
 PROFILERS = {
     "simple": SimpleProfiler,
     "advanced": AdvancedProfiler,
-    "pytorch": PytorchProfiler
+    "pytorch": PyTorchProfiler
 }
 
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 007285a0416ec..74d99a1eea8cc 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -30,7 +30,7 @@
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv
 from pytorch_lightning.loggers import TensorBoardLogger
-from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, PytorchProfiler, SimpleProfiler
+from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, PyTorchProfiler, SimpleProfiler
 from pytorch_lightning.trainer.logging import TrainerLoggingMixin
 from pytorch_lightning.trainer.states import TrainerState
 from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
@@ -39,6 +39,12 @@
 from tests.base import BoringModel, EvalModelTemplate
 
 
+@pytest.fixture
+def pytorch_profiler(tmpdir):
+    profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"))
+    return profiler
+
+
 @pytest.mark.parametrize("url_ckpt", [True, False])
 def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
     """Tests use case where trainer saves the model, and user loads it from tags independently."""
@@ -1421,7 +1427,7 @@ def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, l
     ('simple', SimpleProfiler),
     ('Simple', SimpleProfiler),
     ('advanced', AdvancedProfiler),
-    ('pytorch', PytorchProfiler),
+    ('pytorch', PyTorchProfiler),
 ])
 def test_trainer_profiler_correct_args(profiler, expected):
     kwargs = {'profiler': profiler} if profiler is not None else {}
@@ -1444,22 +1450,59 @@ def test_trainer_profiler_incorrect_arg_type(profiler):
         Trainer(profiler=profiler)
 
 
-def test_pytorch_profiler(tmpdir):
-    class TestModel(BoringModel):
-        def training_step(self, batch, batch_idx):
-            output = self.layer(batch)
-            loss = self.loss(batch, output)
-            return {"loss": loss}
+def _get_pytorch_profiler_total_duration(events):
+    total_time = sum([e.cpu_time + e.cuda_time for e in events])
+    return total_time / 1e6  # convert microseconds to seconds
 
-    model = TestModel()
 
-    limit_train_batches = 2
+def test_autograd_profiler_overhead(pytorch_profiler, n_iter=5):
+    """Ensure that the profiler doesn't introduce too much overhead during training."""
+    for _ in range(n_iter):
+        with pytorch_profiler.profile("test_step"):
+            a = torch.ones(42)
+            b = torch.abs(a)
+            _ = a + b
+
+    action_profile = pytorch_profiler.profiled_actions["test_step"]
+    total_duration = _get_pytorch_profiler_total_duration(action_profile)
+    average_duration = total_duration / n_iter
+    assert average_duration < pytorch_profiler.PROFILER_OVERHEAD_MAX_TOLERANCE
+    pytorch_profiler.describe()
+    data = Path(pytorch_profiler.output_fname).read_text()
+    assert len(data) > 0
+
+
+def test_autograd_profiler_describe(tmpdir, pytorch_profiler):
+    """Ensure the profiler won't fail when reporting the summary."""
+    with pytorch_profiler.profile("test_step"):
+        pass
+
+    # log to stdout and print to file
+    pytorch_profiler.describe()
+    data = Path(pytorch_profiler.output_fname).read_text()
+    assert len(data) > 0
+
+
+def test_pytorch_profiler_value_errors(pytorch_profiler):
+    """Ensure errors are raised where expected."""
+
+    action = "test_step"
+    with pytest.raises(ValueError):
+        pytorch_profiler.stop(action)
+
+    pytorch_profiler.start(action)
+    pytorch_profiler.stop(action)
+
+
+def test_pytorch_profiler_trainer(tmpdir):
+
+    profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"))
+
+    model = BoringModel()
     trainer = Trainer(
-        default_root_dir=tmpdir,
-        limit_train_batches=limit_train_batches,
-        limit_val_batches=2,
-        max_epochs=1,
-        profiler='pytorch'
+        fast_dev_run=True,
+        profiler=profiler
     )
-
     trainer.fit(model)
+    assert len(profiler.summary()) > 0
+    assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'}

From 803aaa2cfc4964ea28c3fc7a8cc5115df7133ba5 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Thu, 21 Jan 2021 19:07:12 +0000
Subject: [PATCH 10/28] update

---
 pytorch_lightning/profiler/profilers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 6c2141db50330..fe90884f9e8c1 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -293,7 +293,7 @@ class PyTorchProfiler(BaseProfiler):
     different operators inside your model - both on the CPU and GPU
     """
 
-    PROFILER_OVERHEAD_MAX_TOLERANCE = 0.0005
+    PROFILER_OVERHEAD_MAX_TOLERANCE = 7.5e-4
 
     def __init__(self,
                  output_filename: Optional[str] = None,
@@ -417,11 +417,13 @@ def _create_profiler(self, action_name, profiler, enter=True):
     def _stop(self, action_name: str) -> None:
         if self.profiler is None:
             return
+
         self.profiler.__exit__(
             exc_type=None,
             exc_val=None,
             exc_tb=None
         )
+
         events = self.profiler.function_events
         self.profiler = None
         for name in self.running_stack:

From 698b43adcab4cb9662ae80f78fdda290e1a742ad Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Thu, 21 Jan 2021 19:19:50 +0000
Subject: [PATCH 11/28] update doc

---
 pytorch_lightning/profiler/__init__.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 3f395fdcfd9aa..cab5eb191f906 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -134,15 +134,16 @@ def custom_processing_step(self, data):
 
 The profiler's results will be printed on the completion of a training `fit()`. This profiler
 report can be quite long, so you can also specify an `output_filename` to save the report instead
-of logging it to the output in your terminal. The output below shows the profiling for the action
-`get_train_batch`.
-This profiler will record only for `training_step`, `evaluation_step` and `test_step` functions.
+of logging it to the output in your terminal.
+
+This profiler will record only for `training_step_and_backward`, `evaluation_step` and `test_step` functions by default.
+The output below shows the profiling for the action `training_step_and_backward`.
 
 .. code-block:: python
 
     Profiler Report
 
-    Profile stats for: training_step
+    Profile stats for: training_step_and_backward
     ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
     Name                   Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg
     ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
@@ -169,6 +170,17 @@ def custom_processing_step(self, data):
     ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
     Self CPU time total: 1.681ms
 
+If you need to profile more functions, do as follow:
+
+.. code-block:: python
+
+    profiler = Profiler(profiled_functions=["my_own_profiled_function"])
+
+    with profiler.profile("my_own_profiled_function"):
+
+        ...
+
+
 When running with `PyTorchProfiler(emit_nvtx=True)`. You should run as following:
 
 nvprof --profile-from-start off -o trace_name.prof -- <regular command here>

From da9a56d6220a9b035920713e0329425f87faee80 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Thu, 21 Jan 2021 19:24:47 +0000
Subject: [PATCH 12/28] update doc

---
 pytorch_lightning/profiler/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index cab5eb191f906..197ad1a7f7fc9 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -178,7 +178,7 @@ def custom_processing_step(self, data):
 
     with profiler.profile("my_own_profiled_function"):
 
-        ...
+        pass
 
 
 When running with `PyTorchProfiler(emit_nvtx=True)`. You should run as following:

From 3b119fd4bcfb176a590cb836740ef6eae27e6631 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Thu, 21 Jan 2021 19:46:01 +0000
Subject: [PATCH 13/28] update doc

---
 pytorch_lightning/profiler/__init__.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 197ad1a7f7fc9..ef8189717a02f 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -170,17 +170,6 @@ def custom_processing_step(self, data):
     ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
     Self CPU time total: 1.681ms
 
-If you need to profile more functions, do as follow:
-
-.. code-block:: python
-
-    profiler = Profiler(profiled_functions=["my_own_profiled_function"])
-
-    with profiler.profile("my_own_profiled_function"):
-
-        pass
-
-
 When running with `PyTorchProfiler(emit_nvtx=True)`. You should run as following:
 
 nvprof --profile-from-start off -o trace_name.prof -- <regular command here>

From 75c966f1e1d3275c8f2553ec718382edfcf99b3f Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 08:58:05 +0000
Subject: [PATCH 14/28] update on comments

---
 pytorch_lightning/profiler/profilers.py | 85 ++++++++++++++-----------
 tests/trainer/test_trainer.py           | 32 +++++++++-
 2 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index fe90884f9e8c1..927942e432354 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -23,13 +23,14 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from contextlib import contextmanager
-from typing import Optional, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.utilities.cloud_io import get_filesystem
+from pytorch_lightning.utilities.distributed import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
 
@@ -294,23 +295,31 @@ class PyTorchProfiler(BaseProfiler):
     """
 
     PROFILER_OVERHEAD_MAX_TOLERANCE = 7.5e-4
-
-    def __init__(self,
-                 output_filename: Optional[str] = None,
-                 enabled: bool = True,
-                 use_cuda: bool = False,
-                 record_shapes: bool = False,
-                 profile_memory: bool = False,
-                 group_by_input_shapes: bool = False,
-                 with_stack: bool = False,
-                 use_kineto: bool = False,
-                 use_cpu: bool = False,
-                 emit_nvtx: bool = False,
-                 export_to_chrome: bool = False,
-                 path_to_export_trace: bool = None,
-                 row_limit: int = 20,
-                 sort_by_key: Optional[str] = None,
-                 profiled_functions=["training_step_and_backward", "validation_step", "test_step"]):
+    PROFILED_FUNCTIONS = ["training_step_and_backward", "validation_step", "test_step"]
+    AVAILABLE_SORT_KEYS = [
+        "cpu_time", "cuda_time", "cpu_time_total",
+        "cuda_time_total", "cpu_memory_usage", "cuda_memory_usage",
+        "self_cpu_memory_usage", "self_cuda_memory_usage", "count"
+    ]
+
+    def __init__(
+        self,
+        output_filename: Optional[str] = None,
+        enabled: bool = True,
+        use_cuda: bool = False,
+        record_shapes: bool = False,
+        profile_memory: bool = False,
+        group_by_input_shapes: bool = False,
+        with_stack: bool = False,
+        use_kineto: bool = False,
+        use_cpu: bool = False,
+        emit_nvtx: bool = False,
+        export_to_chrome: bool = False,
+        path_to_export_trace: str = None,
+        row_limit: int = 20,
+        sort_by_key: Optional[str] = None,
+        profiled_functions: Optional[List] = None,
+    ):
         """
         Args:
 
@@ -326,6 +335,8 @@ def __init__(self,
 
             profile_memory: Whether to report memory usage, default: True (1.6.0)
 
+            group_by_input_shapes: Include operator input shapes and group calls by shape.
+
             with_stack: record source information (file and line number) for the ops (1.7.0)
 
             use_kineto: experimental support for Kineto profiler (1.8.0)
@@ -340,7 +351,7 @@ def __init__(self,
 
             export_to_chrome: Wether to export the sequence of profiled operators for Chrome.
 
-            path_to_export_trace: Path to exported traces. By default, it will be save
+            path_to_export_trace: Directory path to export traces. By default, it will be saved
                 where the file being is being run.
 
             row_limit: Limit the number of rows in a table, `0` is a special value that
@@ -351,10 +362,12 @@ def __init__(self,
             profiled_functions: list of profiled functions which will create a context manager on.
                 Any other will be pass through.
         """
+
         self.profiled_actions = {}
         # PyTorch Profiler doesn't seem to work with multiple processes
+        # todo: Try to find a solution
         self.enabled = enabled and os.getenv("LOCAL_RANK", None) is None
-        self.profiled_functions = profiled_functions
+        self.profiled_functions = profiled_functions or self.PROFILED_FUNCTIONS
         self.use_cuda = use_cuda
         self.record_shapes = record_shapes
         self.profile_memory = profile_memory
@@ -367,9 +380,15 @@ def __init__(self,
         self.emit_nvtx = emit_nvtx
         self.export_to_chrome = export_to_chrome
         self.path_to_export_trace = path_to_export_trace
-        if self.sort_by_key not in self.available_sort_by_keys:
+
+        if export_to_chrome and path_to_export_trace is None:
+            rank_zero_warn(
+                "The exported trace will be saved locally because `path_to_export_trace` is empty. "
+                "Note: Each function will generate its own trace file.")
+
+        if self.sort_by_key not in self.AVAILABLE_SORT_KEYS:
             raise MisconfigurationException(
-                f"Found sort_by_key: {sort_by_key}. Should be within {self.available_sort_by_keys}. ")
+                f"Found sort_by_key: {sort_by_key}. Should be within {self.AVAILABLE_SORT_KEYS}. ")
 
         self.profiled_actions = {}
         self.context_names = {}
@@ -424,13 +443,13 @@ def _stop(self, action_name: str) -> None:
             exc_tb=None
         )
 
-        events = self.profiler.function_events
+        function_events = self.profiler.function_events
         self.profiler = None
         for name in self.running_stack:
             if name not in self.profiled_actions:
-                self.profiled_actions[name] = events
+                self.profiled_actions[name] = function_events
             else:
-                self.profiled_actions[name] += events
+                self.profiled_actions[name] += function_events
 
     def stop(self, action_name: str) -> None:
         if action_name in self.profiled_functions:
@@ -449,24 +468,24 @@ def summary(self) -> str:
         output_string = ''
 
         if self.enabled:
-            for action_name, events in self.profiled_actions.items():
+            for action_name, function_events in self.profiled_actions.items():
 
                 # next line is a workaround for a pytorch issue (fixed on master, still present
                 # on 1.7). Without it the code fails with `AssertionError: There is already a CPU
                 # parent event for detach`
-                events.populate_cpu_children = lambda: None
+                function_events.populate_cpu_children = lambda: None
 
                 if self.export_to_chrome:
                     filename = f"{action_name}_trace.json"
                     path_to_trace = filename if self.path_to_export_trace is None \
                         else os.path.join(self.path_to_export_trace, filename)
-                    events.export_chrome_trace(path_to_trace)
+                    function_events.export_chrome_trace(path_to_trace)
 
                 if self.emit_nvtx:
                     return output_string
 
                 else:
-                    table = events.key_averages(
+                    table = function_events.key_averages(
                         group_by_input_shapes=self.group_by_input_shapes).table(
                             sort_by=self.sort_by_key,
                             row_limit=self.row_limit)
@@ -491,11 +510,3 @@ def __del__(self):
         """Close profiler's stream."""
         if self.output_file:
             self.output_file.close()
-
-    @property
-    def available_sort_by_keys(self):
-        return [
-            "cpu_time", "cuda_time", "cpu_time_total",
-            "cuda_time_total", "cpu_memory_usage", "cuda_memory_usage",
-            "self_cpu_memory_usage", "self_cuda_memory_usage", "count"
-        ]
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 74d99a1eea8cc..a0523a2927256 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1455,7 +1455,7 @@ def _get_pytorch_profiler_total_duration(events):
     return total_time / 1e6  # convert microseconds to seconds
 
 
-def test_autograd_profiler_overhead(pytorch_profiler, n_iter=5):
+def test_pytorch_profiler_overhead(pytorch_profiler, n_iter=5):
     """Ensure that the profiler doesn't introduce too much overhead during training."""
     for _ in range(n_iter):
         with pytorch_profiler.profile("test_step"):
@@ -1472,7 +1472,7 @@ def test_autograd_profiler_overhead(pytorch_profiler, n_iter=5):
     assert len(data) > 0
 
 
-def test_autograd_profiler_describe(tmpdir, pytorch_profiler):
+def test_pytorch_profiler_describe(tmpdir, pytorch_profiler):
     """Ensure the profiler won't fail when reporting the summary."""
     with pytorch_profiler.profile("test_step"):
         pass
@@ -1495,6 +1495,7 @@ def test_pytorch_profiler_value_errors(pytorch_profiler):
 
 
 def test_pytorch_profiler_trainer(tmpdir):
+    """Ensure that the profiler can be passed to the Trainer and that the default steps are properly recorded."""
 
     profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"))
 
@@ -1506,3 +1507,30 @@ def test_pytorch_profiler_trainer(tmpdir):
     trainer.fit(model)
     assert len(profiler.summary()) > 0
     assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'}
+
+
+def test_pytorch_profiler_nested(tmpdir):
+    """Ensure that the profiler handles nested context"""
+
+    pytorch_profiler = PyTorchProfiler(
+        profiled_functions=["a", "b", "c"],
+        use_cuda=False,
+        output_filename=os.path.join(tmpdir, "profiler.txt"))
+
+    with pytorch_profiler.profile("a"):
+        a = torch.ones(42)
+        with pytorch_profiler.profile("b"):
+            b = torch.zeros(42)
+        with pytorch_profiler.profile("c"):
+            _ = a + b
+
+    pa = pytorch_profiler.profiled_actions
+
+    expected_a = ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty']
+    assert [e.name for e in pa['a']] == expected_a
+
+    expected_b = ['zeros', 'empty', 'zero_', 'fill_']
+    assert [e.name for e in pa['b']] == expected_b
+
+    expected_c = ['add', 'empty']
+    assert [e.name for e in pa['c']] == expected_c

From f6ae283a687c785555977e6147e0fbabea5666d2 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 09:03:41 +0000
Subject: [PATCH 15/28] update docstring

---
 pytorch_lightning/profiler/profilers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 927942e432354..968b0a60f9f3c 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -289,10 +289,6 @@ def __del__(self):
 
 
 class PyTorchProfiler(BaseProfiler):
-    """
-    This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
-    different operators inside your model - both on the CPU and GPU
-    """
 
     PROFILER_OVERHEAD_MAX_TOLERANCE = 7.5e-4
     PROFILED_FUNCTIONS = ["training_step_and_backward", "validation_step", "test_step"]
@@ -321,6 +317,10 @@ def __init__(
         profiled_functions: Optional[List] = None,
     ):
         """
+
+        This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
+        different operators inside your model - both on the CPU and GPU
+
         Args:
 
             output_filename: optionally save profile results to file instead of printing

From f0aed961e57c188b1654753944cfe10fbdf597b8 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 09:08:30 +0000
Subject: [PATCH 16/28] update docstring

---
 pytorch_lightning/profiler/__init__.py  | 5 +++--
 pytorch_lightning/profiler/profilers.py | 8 ++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index ef8189717a02f..c133c4c2d7396 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -50,7 +50,7 @@
 
 
 Advanced Profiling
---------------------
+------------------
 
 If you want more information on the functions called during each event, you can use the `AdvancedProfiler`.
 This option uses Python's cProfiler_ to provide a report of time spent on *each* function called within your code.
@@ -116,7 +116,7 @@ def custom_processing_step(self, data):
 
 
 PyTorch Profiling
---------------------
+-----------------
 
 Autograd includes a profiler that lets you inspect the cost of different operators
 inside your model - both on the CPU and GPU.
@@ -138,6 +138,7 @@ def custom_processing_step(self, data):
 
 This profiler will record only for `training_step_and_backward`, `evaluation_step` and `test_step` functions by default.
 The output below shows the profiling for the action `training_step_and_backward`.
+The user can provide ``PyTorchProfiler(profiled_functions=[...])`` to extend the scope of profiled functions.
 
 .. code-block:: python
 
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 968b0a60f9f3c..42533565fa45d 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -344,10 +344,10 @@ def __init__(
             use_cpu: use_kineto=True and can be used to lower the overhead for GPU-only profiling (1.8.0)
 
             emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
-                * Run: nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
+                Run: nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
                 To visualize, you can either use:
-                    * nvvp trace_name.prof
-                    * torch.autograd.profiler.load_nvprof(path)
+                    nvvp trace_name.prof
+                    torch.autograd.profiler.load_nvprof(path)
 
             export_to_chrome: Wether to export the sequence of profiled operators for Chrome.
 
@@ -355,7 +355,7 @@ def __init__(
                 where the file being is being run.
 
             row_limit: Limit the number of rows in a table, `0` is a special value that
-                 removes the limit completely.
+                removes the limit completely.
 
             sort_by_key: Keys to sort out profiled table
 

From 5dd2b4df9545aae725bc2e14cffcae2ea7e5a1eb Mon Sep 17 00:00:00 2001
From: Jirka Borovec <jirka.borovec@seznam.cz>
Date: Fri, 22 Jan 2021 10:30:49 +0100
Subject: [PATCH 17/28] try

---
 pytorch_lightning/profiler/profilers.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 42533565fa45d..1fe20c9c0189e 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -317,7 +317,6 @@ def __init__(
         profiled_functions: Optional[List] = None,
     ):
         """
-
         This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
         different operators inside your model - both on the CPU and GPU
 
@@ -326,12 +325,12 @@ def __init__(
             output_filename: optionally save profile results to file instead of printing
                 to std out when training is finished.
 
-            enabled: Setting this to False makes this context manager a no-op. Default: True
+            enabled: Setting this to False makes this context manager a no-op.
 
             use_cuda: Enables timing of CUDA events as well using the cudaEvent API.
-                Adds approximately 4us of overhead to each tensor operation. Default: True
+                Adds approximately 4us of overhead to each tensor operation.
 
-            record_shapes:  If shapes recording is set, information about input dimensions will be collected.
+            record_shapes: If shapes recording is set, information about input dimensions will be collected.
 
             profile_memory: Whether to report memory usage, default: True (1.6.0)
 
@@ -344,8 +343,12 @@ def __init__(
             use_cpu: use_kineto=True and can be used to lower the overhead for GPU-only profiling (1.8.0)
 
             emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
-                Run: nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
-                To visualize, you can either use:
+                Run::
+
+                    nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
+
+                To visualize, you can either use::
+
                     nvvp trace_name.prof
                     torch.autograd.profiler.load_nvprof(path)
 

From 03b3ea5fc804d126185e26e4e7f6c46f7ea9e10d Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 09:38:18 +0000
Subject: [PATCH 18/28] update test

---
 tests/trainer/test_trainer.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index a0523a2927256..de11aec9e7d42 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -17,6 +17,7 @@
 import sys
 from argparse import Namespace
 from copy import deepcopy
+from distutils.version import LooseVersion
 from pathlib import Path
 from unittest.mock import ANY, call, patch
 
@@ -1526,11 +1527,23 @@ def test_pytorch_profiler_nested(tmpdir):
 
     pa = pytorch_profiler.profiled_actions
 
-    expected_a = ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty']
-    assert [e.name for e in pa['a']] == expected_a
+    # From PyTorch 1.6.0, more operation are being traced.
+    if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+        expected_a = ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty']
+        assert [e.name for e in pa['a']] == expected_a
 
-    expected_b = ['zeros', 'empty', 'zero_', 'fill_']
-    assert [e.name for e in pa['b']] == expected_b
+        expected_b = ['zeros', 'empty', 'zero_', 'fill_']
+        assert [e.name for e in pa['b']] == expected_b
 
-    expected_c = ['add', 'empty']
-    assert [e.name for e in pa['c']] == expected_c
+        expected_c = ['add', 'empty']
+        assert [e.name for e in pa['c']] == expected_c
+
+    else:
+        expected_a = ['add']
+        assert [e.name for e in pa['a']] == expected_a
+
+        expected_b = []
+        assert [e.name for e in pa['b']] == expected_b
+
+        expected_c = ['add']
+        assert [e.name for e in pa['c']] == expected_c

From 1e6a9535b6344ac9be31eb9b0e710b8469cad60c Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 10:21:19 +0000
Subject: [PATCH 19/28] Update pytorch_lightning/profiler/__init__.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 pytorch_lightning/profiler/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index c133c4c2d7396..e3d7a8a1ab973 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -171,9 +171,9 @@ def custom_processing_step(self, data):
     ---------------------  ---------------  ---------------  ---------------  ---------------  ---------------
     Self CPU time total: 1.681ms
 
-When running with `PyTorchProfiler(emit_nvtx=True)`. You should run as following:
+When running with `PyTorchProfiler(emit_nvtx=True)`, you should run as follows::
 
-nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
+    nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
 
 To visualize the profiled operation, you can either:
 

From 21ae2da97e62c561eae28520c47ca76a6f174f3f Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 10:21:32 +0000
Subject: [PATCH 20/28] Update pytorch_lightning/profiler/__init__.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 pytorch_lightning/profiler/__init__.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index e3d7a8a1ab973..7398b21c6c1b8 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -177,9 +177,13 @@ def custom_processing_step(self, data):
 
 To visualize the profiled operation, you can either:
 
-* Use: nvvp trace_name.prof
+* Use::
 
-* Use: python -c 'import torch; print(torch.autograd.profiler.load_nvprof("trace_name.prof"))'
+    nvvp trace_name.prof
+
+* Use::
+
+     python -c 'import torch; print(torch.autograd.profiler.load_nvprof("trace_name.prof"))'
 
 """
 

From f6f0d890770807f17724ac3f7e1274df5e85c39b Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 10:34:23 +0000
Subject: [PATCH 21/28] update on comments

---
 pytorch_lightning/profiler/profilers.py | 124 +++++++++++++-----------
 tests/trainer/test_trainer.py           |  27 ++----
 2 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 1fe20c9c0189e..f9551fe90ced9 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -290,13 +290,12 @@ def __del__(self):
 
 class PyTorchProfiler(BaseProfiler):
 
-    PROFILER_OVERHEAD_MAX_TOLERANCE = 7.5e-4
-    PROFILED_FUNCTIONS = ["training_step_and_backward", "validation_step", "test_step"]
-    AVAILABLE_SORT_KEYS = [
+    PROFILED_FUNCTIONS = ("training_step_and_backward", "validation_step", "test_step")
+    AVAILABLE_SORT_KEYS = (
         "cpu_time", "cuda_time", "cpu_time_total",
         "cuda_time_total", "cpu_memory_usage", "cuda_memory_usage",
         "self_cpu_memory_usage", "self_cuda_memory_usage", "count"
-    ]
+    )
 
     def __init__(
         self,
@@ -332,15 +331,16 @@ def __init__(
 
             record_shapes: If shapes recording is set, information about input dimensions will be collected.
 
-            profile_memory: Whether to report memory usage, default: True (1.6.0)
+            profile_memory: Whether to report memory usage, default: True (Introduced in PyTorch 1.6.0)
 
             group_by_input_shapes: Include operator input shapes and group calls by shape.
 
-            with_stack: record source information (file and line number) for the ops (1.7.0)
+            with_stack: record source information (file and line number) for the ops (Introduced in PyTorch 1.7.0)
 
-            use_kineto: experimental support for Kineto profiler (1.8.0)
+            use_kineto: experimental support for Kineto profiler (Introduced in PyTorch 1.8.0)
 
-            use_cpu: use_kineto=True and can be used to lower the overhead for GPU-only profiling (1.8.0)
+            use_cpu: use_kineto=True and can be used to lower the overhead
+                for GPU-only profiling (Introduced in PyTorch 1.8.0)
 
             emit_nvtx: Context manager that makes every autograd operation emit an NVTX range
                 Run::
@@ -353,9 +353,10 @@ def __init__(
                     torch.autograd.profiler.load_nvprof(path)
 
             export_to_chrome: Wether to export the sequence of profiled operators for Chrome.
+                It will generate a ``.json`` file which can be read by Chrome.
 
-            path_to_export_trace: Directory path to export traces. By default, it will be save
-                where the file being is being run.
+            path_to_export_trace: Directory path to export ``.json`` traces when using ``export_to_chrome=True``.
+                By default, the traces are saved in the current working directory.
 
             row_limit: Limit the number of rows in a table, `0` is a special value that
                 removes the limit completely.
@@ -386,8 +387,8 @@ def __init__(
 
         if export_to_chrome and path_to_export_trace is None:
             rank_zero_warn(
-                "The exported trace would be save locally as `path_to_export_trace` is empty"
-                "Note: Each functions will generate its own traced file. ")
+                "The exported trace would be save locally as `path_to_export_trace` is empty."
+                " Note: Each functions will generate its own traced file.")
 
         if self.sort_by_key not in self.AVAILABLE_SORT_KEYS:
             raise MisconfigurationException(
@@ -409,15 +410,16 @@ def __init__(
         super().__init__(output_streams=streaming_out)
 
     def start(self, action_name: str) -> None:
-        # stop the running profiler if any
-        if action_name in self.profiled_functions:
-            if len(self.running_stack) > 0:
-                self._stop(self.running_stack[-1])
-            self.running_stack.append(action_name)
+        if action_name not in self.profiled_functions:
+            return
+
+        if len(self.running_stack) > 0:
+            self._stop(self.running_stack[-1])
+        self.running_stack.append(action_name)
 
-            self.context_names[action_name] = "/".join(self.running_stack)
+        self.context_names[action_name] = "/".join(self.running_stack)
 
-            self._start(action_name)
+        self._start(action_name)
 
     def _start(self, action_name: str) -> None:
         if self.emit_nvtx:
@@ -455,51 +457,55 @@ def _stop(self, action_name: str) -> None:
                 self.profiled_actions[name] += function_events
 
     def stop(self, action_name: str) -> None:
-        if action_name in self.profiled_functions:
-            if len(self.running_stack) == 0 or self.running_stack[-1] != action_name:
-                raise ValueError(  # pragma: no-cover
-                    f"Attempting to stop recording an action ({action_name}) which was never started."
-                )
-            self._stop(action_name)
-            self.running_stack.pop()
-            # restore running profiler
-            if len(self.running_stack) > 0:
-                self._start(self.running_stack[-1])
+        if action_name not in self.profiled_functions:
+            return
+
+        if len(self.running_stack) == 0 or self.running_stack[-1] != action_name:
+            raise ValueError(  # pragma: no-cover
+                f"Attempting to stop recording an action ({action_name}) which was never started."
+            )
+        self._stop(action_name)
+        self.running_stack.pop()
+        # restore running profiler
+        if len(self.running_stack) > 0:
+            self._start(self.running_stack[-1])
 
     def summary(self) -> str:
         recorded_stats = {}
         output_string = ''
 
-        if self.enabled:
-            for action_name, function_events in self.profiled_actions.items():
-
-                # next line is a workaround for a pytorch issue (fixed on master, still present
-                # on 1.7). Without it the code fails with `AssertionError: There is already a CPU
-                # parent event for detach`
-                function_events.populate_cpu_children = lambda: None
-
-                if self.export_to_chrome:
-                    filename = f"{action_name}_trace.json"
-                    path_to_trace = filename if self.path_to_export_trace is None \
-                        else os.path.join(self.path_to_export_trace, filename)
-                    function_events.export_chrome_trace(path_to_trace)
-
-                if self.emit_nvtx:
-                    return output_string
-
-                else:
-                    table = function_events.key_averages(
-                        group_by_input_shapes=self.group_by_input_shapes).table(
-                            sort_by=self.sort_by_key,
-                            row_limit=self.row_limit)
-                    recorded_stats[action_name] = table
-
-            # log to standard out
-            output_string = f"{os.linesep}Profiler Report{os.linesep}"
-            for action, stats in recorded_stats.items():
-                output_string += (
-                    f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}"
-                )
+        if not self.enabled:
+            return output_string
+
+        for action_name, function_events in self.profiled_actions.items():
+
+            # next line is a workaround for a pytorch issue (fixed on master, still present
+            # on 1.7). Without it the code fails with `AssertionError: There is already a CPU
+            # parent event for detach`
+            function_events.populate_cpu_children = lambda: None
+
+            if self.export_to_chrome:
+                filename = f"{action_name}_trace.json"
+                path_to_trace = filename if self.path_to_export_trace is None \
+                    else os.path.join(self.path_to_export_trace, filename)
+                function_events.export_chrome_trace(path_to_trace)
+
+            if self.emit_nvtx:
+                return output_string
+
+            else:
+                table = function_events.key_averages(
+                    group_by_input_shapes=self.group_by_input_shapes).table(
+                        sort_by=self.sort_by_key,
+                        row_limit=self.row_limit)
+                recorded_stats[action_name] = table
+
+        # log to standard out
+        output_string = f"{os.linesep}Profiler Report{os.linesep}"
+        for action, stats in recorded_stats.items():
+            output_string += (
+                f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}"
+            )
 
         return output_string
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index de11aec9e7d42..b5a970f236ac5 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1452,27 +1452,10 @@ def test_trainer_profiler_incorrect_arg_type(profiler):
 
 
 def _get_pytorch_profiler_total_duration(events):
-    total_time = sum([e.cpu_time + e.cuda_time for e in events])
+    total_time = sum([evt.cpu_time + evt.cuda_time for evt in events])
     return total_time / 1e6  # convert microseconds to seconds
 
 
-def test_pytorch_profiler_overhead(pytorch_profiler, n_iter=5):
-    """Ensure that the profiler doesn't introduce too much overhead during training."""
-    for _ in range(n_iter):
-        with pytorch_profiler.profile("test_step"):
-            a = torch.ones(42)
-            b = torch.abs(a)
-            _ = a + b
-
-    action_profile = pytorch_profiler.profiled_actions["test_step"]
-    total_duration = _get_pytorch_profiler_total_duration(action_profile)
-    average_duration = total_duration / n_iter
-    assert average_duration < pytorch_profiler.PROFILER_OVERHEAD_MAX_TOLERANCE
-    pytorch_profiler.describe()
-    data = Path(pytorch_profiler.output_fname).read_text()
-    assert len(data) > 0
-
-
 def test_pytorch_profiler_describe(tmpdir, pytorch_profiler):
     """Ensure the profiler won't fail when reporting the summary."""
     with pytorch_profiler.profile("test_step"):
@@ -1529,14 +1512,16 @@ def test_pytorch_profiler_nested(tmpdir):
 
     # From PyTorch 1.6.0, more operation are being traced.
     if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+        prefix_to_remove = "aten::" if LooseVersion(torch.__version__) >= LooseVersion("1.7.1") else ''
+
         expected_a = ['ones', 'empty', 'fill_', 'zeros', 'empty', 'zero_', 'fill_', 'add', 'empty']
-        assert [e.name for e in pa['a']] == expected_a
+        assert [e.name.replace(prefix_to_remove, '') for e in pa['a']] == expected_a
 
         expected_b = ['zeros', 'empty', 'zero_', 'fill_']
-        assert [e.name for e in pa['b']] == expected_b
+        assert [e.name.replace(prefix_to_remove, '') for e in pa['b']] == expected_b
 
         expected_c = ['add', 'empty']
-        assert [e.name for e in pa['c']] == expected_c
+        assert [e.name.replace(prefix_to_remove, '') for e in pa['c']] == expected_c
 
     else:
         expected_a = ['add']

From 2ea05de42ead7f77ff50187aa98ea87dc5767576 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Fri, 22 Jan 2021 10:35:47 +0000
Subject: [PATCH 22/28] remove old code

---
 tests/trainer/test_trainer.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index b5a970f236ac5..3437e59f2cf86 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1451,12 +1451,7 @@ def test_trainer_profiler_incorrect_arg_type(profiler):
         Trainer(profiler=profiler)
 
 
-def _get_pytorch_profiler_total_duration(events):
-    total_time = sum([evt.cpu_time + evt.cuda_time for evt in events])
-    return total_time / 1e6  # convert microseconds to seconds
-
-
-def test_pytorch_profiler_describe(tmpdir, pytorch_profiler):
+def test_pytorch_profiler_describe(pytorch_profiler):
     """Ensure the profiler won't fail when reporting the summary."""
     with pytorch_profiler.profile("test_step"):
         pass

From c397603069587f8353bbb8504fb9596c1cb93588 Mon Sep 17 00:00:00 2001
From: Ubuntu <thomas@grid.ai>
Date: Mon, 25 Jan 2021 10:22:17 +0000
Subject: [PATCH 23/28] add support for ddp

---
 pytorch_lightning/profiler/__init__.py        |  4 ++
 pytorch_lightning/profiler/profilers.py       | 43 ++++++++++++++++---
 .../trainer/connectors/profiler_connector.py  |  4 ++
 pytorch_lightning/trainer/training_loop.py    |  3 ++
 tests/special_tests.sh                        |  1 +
 tests/trainer/test_trainer.py                 | 35 ++++++++++++---
 6 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 7398b21c6c1b8..5c5cd65d3fd28 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -132,6 +132,10 @@ def custom_processing_step(self, data):
     profiler = PyTorchProfiler(...)
     trainer = Trainer(..., profiler=profiler)
 
+
+.. note:: This profiler works with DistributedDataParallel. If ``output_filename`` is provided, each rank will save its profiled operations to its own file.
+
+
 The profiler's results will be printed on the completion of a training `fit()`. This profiler
 report can be quite long, so you can also specify an `output_filename` to save the report instead
 of logging it to the output in your terminal.
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index f9551fe90ced9..2de6a1026fa1e 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -29,6 +29,7 @@
 import torch
 
 from pytorch_lightning import _logger as log
+from pytorch_lightning.utilities import rank_zero_only
 from pytorch_lightning.utilities.cloud_io import get_filesystem
 from pytorch_lightning.utilities.distributed import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -100,6 +101,10 @@ def summary(self) -> str:
         """Create profiler summary in text format."""
 
 
+    def on_train_start(self, local_rank: int):
+        self.local_rank = local_rank
+
+
 class PassThroughProfiler(BaseProfiler):
     """
     This class should be used when you don't want the (small) overhead of profiling.
@@ -314,6 +319,7 @@ def __init__(
         row_limit: int = 20,
         sort_by_key: Optional[str] = None,
         profiled_functions: Optional[List] = None,
+        local_rank: Optional[int] = None,
     ):
         """
         This profiler uses PyTorch's Autograd Profiler and lets you inspect the cost of
@@ -322,7 +328,9 @@ def __init__(
         Args:
 
             output_filename: optionally save profile results to file instead of printing
-                to std out when training is finished.
+                to std out when training is finished. When using ``ddp``,
+                each rank will stream its profiled operations to its own file
+                with the suffix ``_{rank}.txt``.
 
             enabled: Setting this to False makes this context manager a no-op.
 
@@ -365,12 +373,13 @@ def __init__(
 
             profiled_functions: list of profiled functions which will create a context manager on.
                 Any other will be pass through.
+
+            local_rank: When running in a distributed setting, local_rank is used for each process
+                to write to its own file if ``output_filename`` is provided.
         """
 
         self.profiled_actions = {}
-        # PyTorch Profiler doesn't seem to work with multiple processes
-        # todo: Try to find a solution
-        self.enabled = enabled and os.getenv("LOCAL_RANK", None) is None
+        self.enabled = enabled
         self.profiled_functions = profiled_functions or self.PROFILED_FUNCTIONS
         self.use_cuda = use_cuda
         self.record_shapes = record_shapes
@@ -401,14 +410,36 @@ def __init__(
 
         self.output_fname = output_filename
         self.output_file = None
+        self.local_rank = local_rank
+        if self.local_rank is not None:
+            self.on_fit_start(self.local_rank)
+            self.on_fit_start = super().on_fit_start
+
+    def on_train_start(self, local_rank: Optional[str] = None):
+        self.local_rank = local_rank
+
+        if local_rank != 0:
+            self.wrap_functions_into_rank_zero_only()
 
         if self.output_fname:
+            if local_rank is not None:
+                if '.txt' not in self.output_fname:
+                    raise MisconfigurationException("Log file should be .txt file.")
+            
+                self.output_fname = self.output_fname.replace(".txt", f"_{self.local_rank}.txt")
+
             fs = get_filesystem(self.output_fname)
             self.output_file = fs.open(self.output_fname, "w")
 
         streaming_out = [self.output_file.write] if self.output_file else [log.info]
         super().__init__(output_streams=streaming_out)
 
+    def wrap_functions_into_rank_zero_only(self):
+        self.start = rank_zero_only(self.start)
+        self.stop = rank_zero_only(self.stop)
+        self.summary = rank_zero_only(self.summary)
+        self.describe = rank_zero_only(self.describe)
+
     def start(self, action_name: str) -> None:
         if action_name not in self.profiled_functions:
             return
@@ -485,7 +516,7 @@ def summary(self) -> str:
             function_events.populate_cpu_children = lambda: None
 
             if self.export_to_chrome:
-                filename = f"{action_name}_trace.json"
+                filename = f"{action_name}_{self.local_rank}_trace.json"
                 path_to_trace = filename if self.path_to_export_trace is None \
                     else os.path.join(self.path_to_export_trace, filename)
                 function_events.export_chrome_trace(path_to_trace)
@@ -504,7 +535,7 @@ def summary(self) -> str:
         output_string = f"{os.linesep}Profiler Report{os.linesep}"
         for action, stats in recorded_stats.items():
             output_string += (
-                f"{os.linesep}Profile stats for: {action}{os.linesep}{stats}"
+                f"{os.linesep}Profile stats for: {action} rank: {self.local_rank} {os.linesep}{stats}"
             )
 
         return output_string
diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py
index d2e6ada35412e..2e66d2370e40f 100644
--- a/pytorch_lightning/trainer/connectors/profiler_connector.py
+++ b/pytorch_lightning/trainer/connectors/profiler_connector.py
@@ -58,3 +58,7 @@ def on_trainer_init(self, profiler: Union[BaseProfiler, bool, str]):
                 raise ValueError("When passing string value for the `profiler` parameter of"
                                  " `Trainer`, it can only be 'simple' or 'advanced'")
         self.trainer.profiler = profiler or PassThroughProfiler()
+
+    def on_train_start(self, trainer):
+        local_rank = trainer.local_rank if trainer.world_size > 1 else None
+        self.trainer.profiler.on_train_start(local_rank)
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 1b07634908a7e..312a23f46c7ad 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -111,6 +111,9 @@ def on_train_start(self):
         # hook
         self.trainer.call_hook("on_train_start")
 
+        # provide rank to profiler
+        self.trainer.profile_connector.on_train_start(self.trainer)
+
     def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule):
         # bind logger and other properties
         self.trainer.model_connector.copy_trainer_model_properties(model)
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index ea14841c74bad..8650be6fd4682 100644
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -23,3 +23,4 @@ python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequent
 python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection
 # python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance
 python ${DEFAULTS} tests/trainer/logging_process/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp
+python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_trainer_ddp
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 3437e59f2cf86..01b9f72609e87 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -14,6 +14,7 @@
 import math
 import os
 import pickle
+from pytorch_lightning.accelerators import accelerator
 import sys
 from argparse import Namespace
 from copy import deepcopy
@@ -1473,19 +1474,43 @@ def test_pytorch_profiler_value_errors(pytorch_profiler):
     pytorch_profiler.stop(action)
 
 
-def test_pytorch_profiler_trainer(tmpdir):
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1',
+                    reason="test should be run outside of pytest")
+@pytest.mark.parametrize("use_output_filename", [False, True])
+def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename):
     """Ensure that the profiler can be given to the training and default step are properly recorded. """
 
-    profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"))
+    if use_output_filename:
+        output_filename = os.path.join(tmpdir, "profiler.txt")
+    else:
+        output_filename = None
+
+    profiler = PyTorchProfiler(output_filename=output_filename)
 
     model = BoringModel()
     trainer = Trainer(
         fast_dev_run=True,
-        profiler=profiler
+        profiler=profiler,
+        accelerator="ddp",
+        gpus=2
+
     )
     trainer.fit(model)
-    assert len(profiler.summary()) > 0
-    assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'}
+
+    enabled = use_output_filename or not use_output_filename and profiler.local_rank == 0
+    
+    if enabled:
+        assert len(profiler.summary()) > 0
+        assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'}
+    else:
+        assert profiler.summary() is None
+        assert set(profiler.profiled_actions.keys()) == set()
+    
+    if use_output_filename:
+        profiler.describe()
+        data = Path(profiler.output_fname).read_text()
+        assert len(data) > 0
 
 
 def test_pytorch_profiler_nested(tmpdir):

From 1db6e678d9d2c0233aeb486cd9c42cab698599e3 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Mon, 25 Jan 2021 10:25:11 +0000
Subject: [PATCH 24/28] resolve flake8

---
 pytorch_lightning/profiler/__init__.py  | 3 ++-
 pytorch_lightning/profiler/profilers.py | 9 ++++-----
 tests/trainer/test_trainer.py           | 5 ++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 5c5cd65d3fd28..63b4b013fdbf0 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -133,7 +133,8 @@ def custom_processing_step(self, data):
     trainer = Trainer(..., profiler=profiler)
 
 
-.. note:: This profiler works with DistributedDataParallel. If output_filename is provided, each rank will save the profiled operation to their own file.
+This profiler works with PyTorch ``DistributedDataParallel``.
+If ``output_filename`` is provided, each rank will save its profiled operations to its own file.
 
 
 The profiler's results will be printed on the completion of a training `fit()`. This profiler
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 2de6a1026fa1e..28478e7c66fed 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -100,7 +100,6 @@ def describe(self) -> None:
     def summary(self) -> str:
         """Create profiler summary in text format."""
 
-
     def on_train_start(self, local_rank: int):
         self.local_rank = local_rank
 
@@ -328,8 +327,8 @@ def __init__(
         Args:
 
             output_filename: optionally save profile results to file instead of printing
-                to std out when training is finished. When using ``ddp``, 
-                each rank will stream the profiled operation to their own file 
+                to std out when training is finished. When using ``ddp``,
+                each rank will stream the profiled operation to their own file
                 with the extension ``_{rank}.txt``
 
             enabled: Setting this to False makes this context manager a no-op.
@@ -374,7 +373,7 @@ def __init__(
             profiled_functions: list of profiled functions which will create a context manager on.
                 Any other will be pass through.
 
-            local_rank: When running in distributed setting, local_rank is used for each process 
+            local_rank: When running in distributed setting, local_rank is used for each process
                 to write to their own file if `output_fname` is provided.
         """
 
@@ -425,7 +424,7 @@ def on_train_start(self, local_rank: Optional[str] = None):
             if local_rank is not None:
                 if '.txt' not in self.output_fname:
                     raise MisconfigurationException("Log file should be .txt file.")
-            
+
                 self.output_fname = self.output_fname.replace(".txt", f"_{self.local_rank}.txt")
 
             fs = get_filesystem(self.output_fname)
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 01b9f72609e87..26b7befe0e974 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -14,7 +14,6 @@
 import math
 import os
 import pickle
-from pytorch_lightning.accelerators import accelerator
 import sys
 from argparse import Namespace
 from copy import deepcopy
@@ -1499,14 +1498,14 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename):
     trainer.fit(model)
 
     enabled = use_output_filename or not use_output_filename and profiler.local_rank == 0
-    
+
     if enabled:
         assert len(profiler.summary()) > 0
         assert set(profiler.profiled_actions.keys()) == {'training_step_and_backward', 'validation_step'}
     else:
         assert profiler.summary() is None
         assert set(profiler.profiled_actions.keys()) == set()
-    
+
     if use_output_filename:
         profiler.describe()
         data = Path(profiler.output_fname).read_text()

From e9866bbc2b43943787d782d412b737e51e738d1b Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Mon, 25 Jan 2021 15:22:54 +0000
Subject: [PATCH 25/28] Update pytorch_lightning/profiler/__init__.py

Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
---
 pytorch_lightning/profiler/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 63b4b013fdbf0..9d12deb31470b 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -145,6 +145,8 @@ def custom_processing_step(self, data):
 The output below shows the profiling for the action `training_step_and_backward`.
 The user can provide ``PyTorchProfiler(profiled_functions=[...])`` to extend the scope of profiled functions.
 
+.. note:: When using the PyTorch Profiler, wall clock time will not not be representative of the true wall clock time. This is due to forcing profiled operations to be measured synchronously, when many CUDA ops happen asynchronously. It is recommended to use this Profiler to find bottlenecks/breakdowns, however for end to end wall clock time use the `SimpleProfiler`.
+
 .. code-block:: python
 
     Profiler Report

From d65beee811daeb1abaedc37d348dcfa0be4cf6d4 Mon Sep 17 00:00:00 2001
From: Ubuntu <thomas@grid.ai>
Date: Mon, 25 Jan 2021 15:30:29 +0000
Subject: [PATCH 26/28] resolve tests

---
 pytorch_lightning/profiler/profilers.py | 18 ++++++++++--------
 tests/trainer/test_trainer.py           |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 28478e7c66fed..4f881f3e217ff 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -100,7 +100,7 @@ def describe(self) -> None:
     def summary(self) -> str:
         """Create profiler summary in text format."""
 
-    def on_train_start(self, local_rank: int):
+    def on_train_start(self, local_rank: Optional[int] = None):
         self.local_rank = local_rank
 
 
@@ -409,15 +409,15 @@ def __init__(
 
         self.output_fname = output_filename
         self.output_file = None
-        self.local_rank = local_rank
-        if self.local_rank is not None:
-            self.on_fit_start(self.local_rank)
-            self.on_fit_start = super().on_fit_start
+        if local_rank is not None:
+            self.on_train_start(local_rank=local_rank)
+            self.on_train_start = super().on_train_start
 
     def on_train_start(self, local_rank: Optional[str] = None):
         self.local_rank = local_rank
 
-        if local_rank != 0:
+        # when logging to `log.info`, only perform profiling on rank 0
+        if local_rank != 0 and self.output_fname is None:
             self.wrap_functions_into_rank_zero_only()
 
         if self.output_fname:
@@ -504,6 +504,8 @@ def summary(self) -> str:
         recorded_stats = {}
         output_string = ''
 
+        local_rank = '0' if self.local_rank is None else self.local_rank 
+
         if not self.enabled:
             return output_string
 
@@ -515,7 +517,7 @@ def summary(self) -> str:
             function_events.populate_cpu_children = lambda: None
 
             if self.export_to_chrome:
-                filename = f"{action_name}_{self.local_rank}_trace.json"
+                filename = f"{action_name}_{local_rank}_trace.json"
                 path_to_trace = filename if self.path_to_export_trace is None \
                     else os.path.join(self.path_to_export_trace, filename)
                 function_events.export_chrome_trace(path_to_trace)
@@ -534,7 +536,7 @@ def summary(self) -> str:
         output_string = f"{os.linesep}Profiler Report{os.linesep}"
         for action, stats in recorded_stats.items():
             output_string += (
-                f"{os.linesep}Profile stats for: {action} rank: {self.local_rank} {os.linesep}{stats}"
+                f"{os.linesep}Profile stats for: {action} rank: {local_rank} {os.linesep}{stats}"
             )
 
         return output_string
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 26b7befe0e974..b9723878adad5 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -42,7 +42,7 @@
 
 @pytest.fixture
 def pytorch_profiler(tmpdir):
-    profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"))
+    profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"), local_rank=0)
     return profiler
 
 

From 8338c5ef398395da628d285d7b72a80bf2f412a8 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Mon, 25 Jan 2021 15:34:46 +0000
Subject: [PATCH 27/28] resolve flake8

---
 pytorch_lightning/core/memory.py        | 7 +++++--
 pytorch_lightning/profiler/__init__.py  | 2 +-
 pytorch_lightning/profiler/profilers.py | 3 +--
 tests/core/test_memory.py               | 7 +++++--
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py
index 4c1710cd36de0..9c30d6c5d6270 100644
--- a/pytorch_lightning/core/memory.py
+++ b/pytorch_lightning/core/memory.py
@@ -16,7 +16,7 @@
 import shutil
 import subprocess
 from collections import OrderedDict
-from typing import Tuple, Dict, Union, List, Any
+from typing import Any, Dict, List, Tuple, Union
 
 import numpy as np
 import torch
@@ -182,7 +182,8 @@ def __init__(self, model, mode: str = MODE_DEFAULT):
         self._model = model
         self._mode = mode
         self._layer_summary = self.summarize()
-        self._precision_megabytes = (self._model.precision / 8.0) * 1e-6 # 1 byte -> 8 bits
+        # 1 byte -> 8 bits
+        self._precision_megabytes = (self._model.precision / 8.0) * 1e-6
 
     @property
     def named_modules(self) -> List[Tuple[str, nn.Module]]:
@@ -389,9 +390,11 @@ def get_gpu_memory_map() -> Dict[str, int]:
     }
     return gpu_memory_map
 
+
 def get_formatted_model_size(total_model_size: float) -> float:
     return f"{total_model_size:,.3f}"
 
+
 def get_human_readable_count(number: int) -> str:
     """
     Abbreviates an integer number with K, M, B, T for thousands, millions,
diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py
index 9d12deb31470b..546ed45e18263 100644
--- a/pytorch_lightning/profiler/__init__.py
+++ b/pytorch_lightning/profiler/__init__.py
@@ -145,7 +145,7 @@ def custom_processing_step(self, data):
 The output below shows the profiling for the action `training_step_and_backward`.
 The user can provide ``PyTorchProfiler(profiled_functions=[...])`` to extend the scope of profiled functions.
 
-.. note:: When using the PyTorch Profiler, wall clock time will not not be representative of the true wall clock time. This is due to forcing profiled operations to be measured synchronously, when many CUDA ops happen asynchronously. It is recommended to use this Profiler to find bottlenecks/breakdowns, however for end to end wall clock time use the `SimpleProfiler`.
+.. note:: When using the PyTorch Profiler, wall clock time will not be representative of the true wall clock time. This is due to forcing profiled operations to be measured synchronously, when many CUDA ops happen asynchronously. It is recommended to use this Profiler to find bottlenecks/breakdowns, however for end to end wall clock time use the `SimpleProfiler`.   # noqa E501
 
 .. code-block:: python
 
diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index 4f881f3e217ff..a1221524faf4b 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -503,8 +503,7 @@ def stop(self, action_name: str) -> None:
     def summary(self) -> str:
         recorded_stats = {}
         output_string = ''
-
-        local_rank = '0' if self.local_rank is None else self.local_rank 
+        local_rank = '0' if self.local_rank is None else self.local_rank
 
         if not self.enabled:
             return output_string
diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py
index 699b248013020..7e83c928b31e2 100644
--- a/tests/core/test_memory.py
+++ b/tests/core/test_memory.py
@@ -40,8 +40,10 @@ class PreCalculatedModel(BoringModel):
 
     def __init__(self, precision: int = 32):
         super().__init__()
-        self.layer = nn.Linear(32, 1000, bias=False) # 32K params
-        self.layer1 = nn.Linear(1000, 218, bias=False) # 218K params 
+        # 32K params
+        self.layer = nn.Linear(32, 1000, bias=False)
+        # 218K params
+        self.layer1 = nn.Linear(1000, 218, bias=False)
 
         # calculate model size based on precision.
         self.pre_calculated_model_size = 1.0 / (32 / precision)
@@ -50,6 +52,7 @@ def forward(self, x):
         x = self.layer(x)
         return self.layer1(x)
 
+
 class UnorderedModel(LightningModule):
     """ A model in which the layers not defined in order of execution """
 

From 9ae56ccd501c93c54634a9144bf1ddad2b2c6451 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 26 Jan 2021 08:56:23 +0000
Subject: [PATCH 28/28] resolve flake8

---
 tests/base/model_test_steps.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/base/model_test_steps.py b/tests/base/model_test_steps.py
index dfbbd7d2d31e6..db70959bfddef 100644
--- a/tests/base/model_test_steps.py
+++ b/tests/base/model_test_steps.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import random
 from abc import ABC
 from collections import OrderedDict