From ecd3678a1fdf234ac47dba3b56784a3c7467d83d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 11 Feb 2021 00:32:47 +0100
Subject: [PATCH 01/34] Refactor utilities/imports.py (#5874)

Co-authored-by: Jirka Borovec
Co-authored-by: Nicki Skafte
---
 pytorch_lightning/utilities/imports.py | 49 ++++++++++++++------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index eb9b924c702cc..aa86d560b6bfe 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -12,50 +12,53 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """General utilities"""
-import importlib
 import platform
 from distutils.version import LooseVersion
+from importlib.util import find_spec
 
 import pkg_resources
 import torch
 
 
 def _module_available(module_path: str) -> bool:
-    """Testing if given module is avalaible in your env
+    """
+    Check if a module path is available in your environment
 
     >>> _module_available('os')
    True
    >>> _module_available('bla.bla')
    False
    """
-    # todo: find a better way than try / except
    try:
-        mods = module_path.split('.')
-        assert mods, 'nothing given to test'
-        # it has to be tested as per partets
-        for i in range(len(mods)):
-            module_path = '.'.join(mods[:i + 1])
-            if importlib.util.find_spec(module_path) is None:
-                return False
-        return True
+        return find_spec(module_path) is not None
    except AttributeError:
+        # Python 3.6
+        return False
+    except ModuleNotFoundError:
+        # Python 3.7+
        return False
 
 
+def _get_version(package: str) -> LooseVersion:
+    return LooseVersion(pkg_resources.get_distribution(package).version)
+
+
+_IS_WINDOWS = platform.system() == "Windows"
+_TORCH_GREATER_EQUAL_1_6 = _get_version("torch") >= LooseVersion("1.6.0")
+
 _APEX_AVAILABLE = _module_available("apex.amp")
-_NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast")
-_OMEGACONF_AVAILABLE = _module_available("omegaconf")
+_BOLTS_AVAILABLE = _module_available('pl_bolts')
+_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available('fairscale.nn.data_parallel')
+_FAIRSCALE_PIPE_AVAILABLE = (
+    _FAIRSCALE_AVAILABLE and _TORCH_GREATER_EQUAL_1_6 and _get_version('fairscale') <= LooseVersion("0.1.3")
+)
+_GROUP_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.group')
+_HOROVOD_AVAILABLE = _module_available("horovod.torch")
 _HYDRA_AVAILABLE = _module_available("hydra")
 _HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental")
-_HOROVOD_AVAILABLE = _module_available("horovod.torch")
+_NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast")
+_OMEGACONF_AVAILABLE = _module_available("omegaconf")
+_RPC_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.rpc')
 _TORCHTEXT_AVAILABLE = _module_available("torchtext")
-_XLA_AVAILABLE = _module_available("torch_xla")
-_FAIRSCALE_AVAILABLE = platform.system() != 'Windows' and _module_available('fairscale.nn.data_parallel')
-_RPC_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.rpc')
-_GROUP_AVAILABLE = platform.system() != 'Windows' and _module_available('torch.distributed.group')
-_FAIRSCALE_PIPE_AVAILABLE = _FAIRSCALE_AVAILABLE and LooseVersion(
-    torch.__version__
-) >= LooseVersion("1.6.0") and LooseVersion(pkg_resources.get_distribution('fairscale').version
-                                            ) <= LooseVersion("0.1.3")
-_BOLTS_AVAILABLE = 
_module_available('pl_bolts') _TORCHVISION_AVAILABLE = _module_available('torchvision') +_XLA_AVAILABLE = _module_available("torch_xla") From 7b00894130ed72ba789f1146fe3faef526f6dfb0 Mon Sep 17 00:00:00 2001 From: chaton Date: Thu, 11 Feb 2021 00:05:59 +0000 Subject: [PATCH 02/34] [feat] Add StochasticWeightAveragingCallback (#5640) * add swa callback * switch back to 1.6.0 * remove optimizer_step * move super * update * forgot update_parameters * update on comments * works for ddp * resolve flake8 * remove set_model * resolve flake8 * resolve cpu * resolve flake8 * resolve flake8 * update * update on comments --- docs/source/extensions/callbacks.rst | 1 + pytorch_lightning/callbacks/__init__.py | 4 +- pytorch_lightning/callbacks/swa.py | 260 +++++++++++++++++++++ pytorch_lightning/trainer/training_loop.py | 3 +- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/imports.py | 4 +- tests/callbacks/test_swa.py | 159 +++++++++++++ 7 files changed, 428 insertions(+), 4 deletions(-) create mode 100644 pytorch_lightning/callbacks/swa.py create mode 100644 tests/callbacks/test_swa.py diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst index 1e3b04b65d4cc..b4e45042aca5b 100644 --- a/docs/source/extensions/callbacks.rst +++ b/docs/source/extensions/callbacks.rst @@ -106,6 +106,7 @@ Lightning has a few built-in callbacks. ModelPruning ProgressBar ProgressBarBase + StochasticWeightAveraging ---------- diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index 043aef26af8bd..3d1c5b2d1c1e7 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -21,6 +21,7 @@ from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint from pytorch_lightning.callbacks.progress import ProgressBar, ProgressBarBase from pytorch_lightning.callbacks.pruning import ModelPruning +from pytorch_lightning.callbacks.swa import StochasticWeightAveraging __all__ = [ 'BackboneFinetuning', @@ -32,7 +33,8 @@ 'LambdaCallback', 'LearningRateMonitor', 'ModelCheckpoint', + 'ModelPruning', 'ProgressBar', 'ProgressBarBase', - 'ModelPruning', + 'StochasticWeightAveraging', ] diff --git a/pytorch_lightning/callbacks/swa.py b/pytorch_lightning/callbacks/swa.py new file mode 100644 index 0000000000000..762b57070d59c --- /dev/null +++ b/pytorch_lightning/callbacks/swa.py @@ -0,0 +1,260 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+r""" +Stochastic Weight Averaging Callback +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +""" +from copy import deepcopy +from typing import Callable, Optional, Union + +import torch +from torch import nn + +import pytorch_lightning as pl +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6_0, rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +if _TORCH_GREATER_EQUAL_1_6_0: + from torch.optim.swa_utils import SWALR + +_AVG_FN = Callable[[torch.Tensor, torch.Tensor, torch.LongTensor], torch.FloatTensor] + + +class StochasticWeightAveraging(Callback): + + def __init__( + self, + swa_epoch_start: Union[int, float] = 0.8, + swa_lrs: Optional[Union[float, list]] = None, + annealing_epochs: int = 10, + annealing_strategy: str = "cos", + avg_fn: Optional[_AVG_FN] = None, + device: Optional[Union[torch.device, str]] = torch.device("cpu"), + ): + r""" + + Implements the Stochastic Weight Averaging (SWA) Callback to average a model. + + Stochastic Weight Averaging was proposed in ``Averaging Weights Leads to + Wider Optima and Better Generalization`` by Pavel Izmailov, Dmitrii + Podoprikhin, Timur Garipov, Dmitry Vetrov and Andrew Gordon Wilson + (UAI 2018). + + This documentation is highly inspired by PyTorch's work on SWA. + The callback arguments follow the scheme defined in PyTorch's ``swa_utils`` package. + + For a SWA explanation, please take a look + `here `_. + + .. warning:: ``StochasticWeightAveraging`` is in beta and subject to change. + + .. warning:: ``StochasticWeightAveraging`` is currently not supported for multiple optimizers/schedulers. + + Arguments: + + swa_epoch_start: If provided as int, the procedure will start from + the ``swa_epoch_start``-th epoch. If provided as float between 0 and 1, + the procedure will start from ``int(swa_epoch_start * max_epochs)`` epoch + + swa_lrs: the learning rate value for all param groups together or separately for each group. + + annealing_epochs: number of epochs in the annealing phase (default: 10) + + annealing_strategy: Specifies the annealing strategy (default: "cos"): + + - ``"cos"``. For cosine annealing. + - ``"linear"`` For linear annealing + + avg_fn: the averaging function used to update the parameters; + the function must take in the current value of the + :class:`AveragedModel` parameter, the current value of :attr:`model` + parameter and the number of models already averaged; if None, + equally weighted average is used (default: ``None``) + + device: if provided, the averaged model will be stored on the ``device``. + When None is provided, it will infer the `device` from ``pl_module``. + (default: ``"cpu"``) + + """ + + err_msg = "swa_epoch_start should be a >0 integer or a float between 0 and 1." 
+        if isinstance(swa_epoch_start, int) and swa_epoch_start < 1:
+            raise MisconfigurationException(err_msg)
+        if isinstance(swa_epoch_start, float) and not (0 <= swa_epoch_start <= 1):
+            raise MisconfigurationException(err_msg)
+
+        if (
+            not isinstance(swa_lrs, (float, list)) or isinstance(swa_lrs, float) and swa_lrs <= 0
+            or isinstance(swa_lrs, list) and not all(lr > 0 and isinstance(lr, float) for lr in swa_lrs)
+        ):
+            raise MisconfigurationException("The `swa_lrs` should be a positive float or a list of positive floats.")
+
+        if avg_fn is not None and not isinstance(avg_fn, Callable):
+            raise MisconfigurationException("The `avg_fn` should be callable.")
+
+        if device is not None and not isinstance(device, (torch.device, str)):
+            raise MisconfigurationException(f"device is expected to be a torch.device or a str. Found {device}")
+
+        self._swa_epoch_start = swa_epoch_start
+        self._swa_lrs = swa_lrs
+        self._annealing_epochs = annealing_epochs
+        self._annealing_strategy = annealing_strategy
+        self._avg_fn = avg_fn or self.avg_fn
+        self._device = device
+        self._model_contains_batch_norm = None
+        self._average_model = None
+
+    @property
+    def swa_start(self) -> int:
+        return max(self._swa_epoch_start - 1, 0)  # 0-based
+
+    @property
+    def swa_end(self) -> int:
+        return self._max_epochs - 1  # 0-based
+
+    @staticmethod
+    def pl_module_contains_batch_norm(pl_module: 'pl.LightningModule'):
+        return any(isinstance(module, nn.modules.batchnorm._BatchNorm) for module in pl_module.modules())
+
+    def on_before_accelerator_backend_setup(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'):
+        # copy the model before moving it to the accelerator device.
+        self._average_model = deepcopy(pl_module)
+        optimizers = trainer.optimizers
+        lr_schedulers = trainer.lr_schedulers
+
+        if len(optimizers) > 1:
+            raise MisconfigurationException("SWA currently not supported for more than 1 `optimizer`.")
+
+        if len(lr_schedulers) > 1:
+            raise MisconfigurationException("SWA currently not supported for more than 1 `lr_scheduler`.")
+
+        if isinstance(self._swa_epoch_start, float):
+            self._swa_epoch_start = int(trainer.max_epochs * self._swa_epoch_start)
+
+        self._model_contains_batch_norm = self.pl_module_contains_batch_norm(pl_module)
+
+        self._max_epochs = trainer.max_epochs
+        if self._model_contains_batch_norm:
+            # virtually increase max_epochs to perform the batch norm update on the latest epoch.
+            trainer.max_epochs += 1
+
+    def on_train_epoch_start(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'):
+        if trainer.current_epoch == self.swa_start:
+            # move the average model to the requested device.
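+            # the user-provided `self._device` wins; only when it is None do we fall back
+            # to the device the LightningModule already lives on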
+ self._average_model = self._average_model.to(self._device or pl_module.device) + + optimizers = trainer.optimizers + lr_scheduler = trainer.lr_schedulers[0]["scheduler"] + + self._swa_scheduler = SWALR( + optimizers[0], + swa_lr=self._swa_lrs, + anneal_epochs=self._annealing_epochs, + anneal_strategy=self._annealing_strategy, + last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1 + ) + + rank_zero_warn(f"Swapping lr_scheduler {lr_scheduler} for {self._swa_scheduler}") + trainer.lr_schedulers[0]["scheduler"] = self._swa_scheduler + + self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device) + + if self.swa_start <= trainer.current_epoch <= self.swa_end: + self.update_parameters(self._average_model, pl_module, self.n_averaged, self.avg_fn) + + # Note: No > here in case the callback is saved with the model and training continues + if trainer.current_epoch == self.swa_end + 1: + + # Transfer weights from average model to pl_module + self.transfer_weights(self._average_model, pl_module) + + # Reset BatchNorm for update + self.reset_batch_norm_and_save_state(pl_module) + + # There is no need to perform either backward or optimizer.step as we are + # performing only one pass over the train data-loader to compute activation statistics + # Therefore, we will virtually increase `num_training_batches` by 1 and skip backward. + trainer.num_training_batches += 1 + trainer.train_loop._skip_backward = True + self._accumulate_grad_batches = trainer.accumulate_grad_batches + trainer.accumulate_grad_batches = len(trainer.train_dataloader) + + def on_train_epoch_end(self, trainer: 'pl.Trainer', *args): + trainer.train_loop._skip_backward = False + + def on_train_end(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule'): + if self._model_contains_batch_norm and trainer.current_epoch == self.swa_end + 1: + # BatchNorm epoch update. Reset state + trainer.accumulate_grad_batches = self._accumulate_grad_batches + trainer.num_training_batches -= 1 + trainer.max_epochs -= 1 + self.reset_momenta() + elif trainer.current_epoch == self.swa_end: + # Last SWA epoch. 
Transfer weights from average model to pl_module + self.transfer_weights(self._average_model, pl_module) + + @staticmethod + def transfer_weights(src_pl_module: 'pl.LightningModule', dst_pl_module: 'pl.LightningModule'): + for src_param, dst_param in zip(src_pl_module.parameters(), dst_pl_module.parameters()): + dst_param.detach().copy_(src_param.to(dst_param.device)) + + def reset_batch_norm_and_save_state(self, pl_module: 'pl.LightningModule'): + """ + Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L140-L154 + """ + self.momenta = {} + for module in pl_module.modules(): + if not isinstance(module, nn.modules.batchnorm._BatchNorm): + continue + module.running_mean = torch.zeros_like( + module.running_mean, device=pl_module.device, dtype=module.running_mean.dtype + ) + module.running_var = torch.ones_like( + module.running_var, device=pl_module.device, dtype=module.running_var.dtype + ) + self.momenta[module] = module.momentum + module.momentum = None + module.num_batches_tracked *= 0 + + def reset_momenta(self): + """ + Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165 + """ + for bn_module in self.momenta.keys(): + bn_module.momentum = self.momenta[bn_module] + + @staticmethod + def update_parameters( + average_model: 'pl.LightningModule', model: 'pl.LightningModule', n_averaged: torch.LongTensor, avg_fn: _AVG_FN + ): + """ + Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L104-L112 + """ + for p_swa, p_model in zip(average_model.parameters(), model.parameters()): + device = p_swa.device + p_swa_ = p_swa.detach() + p_model_ = p_model.detach().to(device) + src = p_model_ if n_averaged == 0 else avg_fn(p_swa_, p_model_, n_averaged.to(device)) + p_swa_.copy_(src) + n_averaged += 1 + + @staticmethod + def avg_fn( + averaged_model_parameter: torch.Tensor, model_parameter: torch.Tensor, num_averaged: torch.LongTensor + ) -> torch.FloatTensor: + """ + Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97 + """ + return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 6bc6ace7d7f84..778e1e7e1051e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -47,6 +47,7 @@ def __init__(self, trainer, multiple_trainloader_mode): self._curr_step_result = None self._cur_grad_norm_dict = None self._multiple_trainloader_mode = multiple_trainloader_mode + self._skip_backward = False self.trainer._multiple_trainloader_mode = multiple_trainloader_mode def on_trainer_init( @@ -800,7 +801,7 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, self.warning_cache.warn("training_step returned None if it was on purpose, ignore this warning...") return None - if self.trainer.train_loop.automatic_optimization: + if not self._skip_backward and self.trainer.train_loop.automatic_optimization: # backward pass with self.trainer.profiler.profile("model_backward"): self.backward(result, optimizer, opt_idx) diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 78ad85bd9cd89..4c61ad29df7b8 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -36,6 +36,7 @@ _NATIVE_AMP_AVAILABLE, _OMEGACONF_AVAILABLE, _RPC_AVAILABLE, + 
_TORCH_GREATER_EQUAL_1_6_0, _TORCHTEXT_AVAILABLE, _TORCHVISION_AVAILABLE, _XLA_AVAILABLE, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index aa86d560b6bfe..32aad7b29e12b 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -44,13 +44,13 @@ def _get_version(package: str) -> LooseVersion: _IS_WINDOWS = platform.system() == "Windows" -_TORCH_GREATER_EQUAL_1_6 = _get_version("torch") >= LooseVersion("1.6.0") +_TORCH_GREATER_EQUAL_1_6_0 = _get_version("torch") >= LooseVersion("1.6.0") _APEX_AVAILABLE = _module_available("apex.amp") _BOLTS_AVAILABLE = _module_available('pl_bolts') _FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available('fairscale.nn.data_parallel') _FAIRSCALE_PIPE_AVAILABLE = ( - _FAIRSCALE_AVAILABLE and _TORCH_GREATER_EQUAL_1_6 and _get_version('fairscale') <= LooseVersion("0.1.3") + _FAIRSCALE_AVAILABLE and _TORCH_GREATER_EQUAL_1_6_0 and _get_version('fairscale') <= LooseVersion("0.1.3") ) _GROUP_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.group') _HOROVOD_AVAILABLE = _module_available("horovod.torch") diff --git a/tests/callbacks/test_swa.py b/tests/callbacks/test_swa.py new file mode 100644 index 0000000000000..b12e20f1c2a01 --- /dev/null +++ b/tests/callbacks/test_swa.py @@ -0,0 +1,159 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
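+#
+# For reference, a minimal end-user invocation of the callback under test looks
+# roughly like this (`model` is assumed to be any LightningModule, not defined here):
+#
+#     trainer = Trainer(max_epochs=10, callbacks=[StochasticWeightAveraging(swa_epoch_start=0.8, swa_lrs=0.05)])
+#     trainer.fit(model)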
+import os +import platform +from unittest import mock + +import pytest +import torch +from torch import nn +from torch.utils.data import DataLoader + +from pytorch_lightning import Trainer +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6_0 +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.helpers import BoringModel, RandomDataset + +if _TORCH_GREATER_EQUAL_1_6_0: + from pytorch_lightning.callbacks import StochasticWeightAveraging + + class SwaTestModel(BoringModel): + + def __init__(self, batchnorm: bool = True): + super().__init__() + layers = [nn.Linear(32, 32)] + if batchnorm: + layers.append(nn.BatchNorm1d(32)) + layers += [nn.ReLU(), nn.Linear(32, 2)] + self.layer = nn.Sequential(*layers) + + def training_step(self, batch, batch_idx): + output = self.forward(batch) + loss = self.loss(batch, output) + return {"loss": loss} + + def train_dataloader(self): + return DataLoader(RandomDataset(32, 64), batch_size=2) + + class SwaTestCallback(StochasticWeightAveraging): + update_parameters_calls: int = 0 + transfer_weights_calls: int = 0 + + def update_parameters(self, *args, **kwargs): + self.update_parameters_calls += 1 + return StochasticWeightAveraging.update_parameters(*args, **kwargs) + + def transfer_weights(self, *args, **kwargs): + self.transfer_weights_calls += 1 + return StochasticWeightAveraging.transfer_weights(*args, **kwargs) + + def on_train_epoch_start(self, trainer, *args): + super().on_train_epoch_start(trainer, *args) + assert trainer.train_loop._skip_backward == (trainer.current_epoch > self.swa_end) + + def on_train_epoch_end(self, trainer, *args): + super().on_train_epoch_end(trainer, *args) + if self.swa_start <= trainer.current_epoch <= self.swa_end: + swa_epoch = trainer.current_epoch - self.swa_start + assert self.n_averaged == swa_epoch + 1 + elif trainer.current_epoch > self.swa_end: + assert self.n_averaged == self._max_epochs - self.swa_start + + def on_train_end(self, trainer, pl_module): + super().on_train_end(trainer, pl_module) + + # make sure these are correctly set again + assert not trainer.train_loop._skip_backward + assert trainer.accumulate_grad_batches == 2 + assert trainer.num_training_batches == 5 + + # check backward call count. 
the batchnorm update epoch should not backward + assert trainer.dev_debugger.count_events( + "backward_call" + ) == trainer.max_epochs * trainer.limit_train_batches + + # check call counts + assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1) + assert self.transfer_weights_calls == 1 + + +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) +def train_with_swa(tmpdir, batchnorm=True, accelerator=None, gpus=None, num_processes=1): + model = SwaTestModel(batchnorm=batchnorm) + swa_start = 2 + max_epochs = 5 + swa_callback = SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1) + assert swa_callback.update_parameters_calls == 0 + assert swa_callback.transfer_weights_calls == 0 + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=max_epochs, + limit_train_batches=5, + limit_val_batches=0, + callbacks=[swa_callback], + accumulate_grad_batches=2, + accelerator=accelerator, + gpus=gpus, + num_processes=num_processes + ) + trainer.fit(model) + + # check the model is the expected + assert trainer.get_model() == model + + +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif( + not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" +) +def test_swa_callback_ddp(tmpdir): + train_with_swa(tmpdir, accelerator="ddp", gpus=2) + + +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_swa_callback_ddp_spawn(tmpdir): + train_with_swa(tmpdir, accelerator="ddp_spawn", gpus=2) + + +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(platform.system() == "Windows", reason="ddp_cpu is not available on Windows") +def test_swa_callback_ddp_cpu(tmpdir): + train_with_swa(tmpdir, accelerator="ddp_cpu", num_processes=2) + + +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU machine") +def test_swa_callback_1_gpu(tmpdir): + train_with_swa(tmpdir, gpus=1) + + +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.parametrize("batchnorm", (True, False)) +def test_swa_callback(tmpdir, batchnorm): + train_with_swa(tmpdir, batchnorm=batchnorm) + + +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +def test_swa_raises(): + with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"): + StochasticWeightAveraging(swa_epoch_start=0, swa_lrs=0.1) + with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"): + StochasticWeightAveraging(swa_epoch_start=1.5, swa_lrs=0.1) + with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"): + StochasticWeightAveraging(swa_epoch_start=-1, swa_lrs=0.1) + with pytest.raises(MisconfigurationException, match="positive float or a list of positive float"): + StochasticWeightAveraging(swa_epoch_start=5, swa_lrs=[0.2, 1]) From e8190e8848e1c9c5012da01e8000828e23eb2434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Feb 2021 01:16:53 +0100 Subject: [PATCH 03/34] Convert progress bar metrics to float 
(#5692) * MetricsHolder(to_float=True) * Update CHANGELOG * Update tests/callbacks/test_progress_bar.py * flake8 Co-authored-by: Jirka Borovec --- CHANGELOG.md | 5 +++- .../logger_connector/logger_connector.py | 6 ++--- tests/callbacks/test_progress_bar.py | 25 +++++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38d97c984c6da..6eed2a9a1388a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -103,7 +103,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Changed the default of `find_unused_parameters` to `False` in DDP ([#5185](https://github.com/PyTorchLightning/pytorch-lightning/pull/5185)) -- Changed `ModelCheckpoint` version suffixes to start at 1 ([5008](https://github.com/PyTorchLightning/pytorch-lightning/pull/5008)) +- Changed `ModelCheckpoint` version suffixes to start at 1 ([#5008](https://github.com/PyTorchLightning/pytorch-lightning/pull/5008)) + + +- Progress bar metrics tensors are now converted to float ([#5692](https://github.com/PyTorchLightning/pytorch-lightning/pull/5692)) - Changed the default value for the `progress_bar_refresh_rate` Trainer argument in Google COLAB notebooks to 20 ([#5516](https://github.com/PyTorchLightning/pytorch-lightning/pull/5516)) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 87caec7248208..439e9046726ce 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -14,7 +14,7 @@ import os from copy import deepcopy from pprint import pprint -from typing import Any, Dict, Iterable, Union +from typing import Dict, Iterable, Union import torch @@ -37,7 +37,7 @@ def __init__(self, trainer): self._callback_metrics = MetricsHolder() self._evaluation_callback_metrics = MetricsHolder(to_float=True) self._logged_metrics = MetricsHolder() - self._progress_bar_metrics = MetricsHolder() + self._progress_bar_metrics = MetricsHolder(to_float=True) self.eval_loop_results = [] self._cached_results = {stage: EpochResultStore(trainer, stage) for stage in RunningStage} self._cached_results[None] = EpochResultStore(trainer, None) @@ -88,7 +88,7 @@ def get_metrics(self, key: str) -> Dict: ) return metrics_holder.metrics - def set_metrics(self, key: str, val: Any) -> None: + def set_metrics(self, key: str, val: Dict) -> None: metrics_holder = getattr(self, f"_{key}", None) metrics_holder.reset(val) diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index d8cf23c1105f0..5f861d7a2cce9 100644 --- a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -16,6 +16,7 @@ from unittest.mock import call, Mock import pytest +import torch from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint, ProgressBar, ProgressBarBase @@ -349,3 +350,27 @@ def test_test_progress_bar_update_amount(tmpdir, test_batches, refresh_rate, tes ) trainer.test(model) progress_bar.test_progress_bar.update.assert_has_calls([call(delta) for delta in test_deltas]) + + +def test_tensor_to_float_conversion(tmpdir): + """Check tensor gets converted to float""" + + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx): + self.log('foo', torch.tensor(0.123), prog_bar=True) + self.log('bar', {"baz": torch.tensor([1])}, prog_bar=True) + return 
super().training_step(batch, batch_idx) + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=2, + logger=False, + checkpoint_callback=False, + ) + trainer.fit(TestModel()) + + pbar = trainer.progress_bar_callback.main_progress_bar + actual = str(pbar.postfix) + assert actual.endswith("foo=0.123, bar={'baz': tensor([1])}") From 9475c845cb29eab5a89163328c1837b2ff56b723 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 11 Feb 2021 11:22:07 +0100 Subject: [PATCH 04/34] Docs/fixes (#5914) * wip * .. * ... * Apply suggestions from code review Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- Makefile | 2 +- docs/.build_docs.sh | 3 --- pytorch_lightning/callbacks/swa.py | 4 ++-- pytorch_lightning/utilities/__init__.py | 2 +- pytorch_lightning/utilities/imports.py | 17 ++++++++++------- tests/callbacks/test_swa.py | 16 ++++++++-------- 6 files changed, 22 insertions(+), 22 deletions(-) delete mode 100644 docs/.build_docs.sh diff --git a/Makefile b/Makefile index 35ae3ed8bdf85..a659d4a4b0229 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ # to imitate SLURM set only single node export SLURM_LOCALID=0 # assume you have installed need packages -export SPHINX_MOCK_REQUIREMENTS=0 +export SPHINX_MOCK_REQUIREMENTS=1 clean: # clean all temp runs diff --git a/docs/.build_docs.sh b/docs/.build_docs.sh deleted file mode 100644 index 6cf6eab2fd398..0000000000000 --- a/docs/.build_docs.sh +++ /dev/null @@ -1,3 +0,0 @@ -rm -rf source/generated -make clean -make html --debug --jobs 2 SPHINXOPTS="-W" diff --git a/pytorch_lightning/callbacks/swa.py b/pytorch_lightning/callbacks/swa.py index 762b57070d59c..2cd573e5a6fd5 100644 --- a/pytorch_lightning/callbacks/swa.py +++ b/pytorch_lightning/callbacks/swa.py @@ -23,10 +23,10 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6_0, rank_zero_warn +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException -if _TORCH_GREATER_EQUAL_1_6_0: +if _TORCH_GREATER_EQUAL_1_6: from torch.optim.swa_utils import SWALR _AVG_FN = Callable[[torch.Tensor, torch.Tensor, torch.LongTensor], torch.FloatTensor] diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 4c61ad29df7b8..01cef30cfd71e 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -36,7 +36,7 @@ _NATIVE_AMP_AVAILABLE, _OMEGACONF_AVAILABLE, _RPC_AVAILABLE, - _TORCH_GREATER_EQUAL_1_6_0, + _TORCH_GREATER_EQUAL_1_6, _TORCHTEXT_AVAILABLE, _TORCHVISION_AVAILABLE, _XLA_AVAILABLE, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 32aad7b29e12b..f71b7887c5099 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""General utilities""" +import operator import platform from distutils.version import LooseVersion from importlib.util import find_spec -import pkg_resources import torch +from pkg_resources import DistributionNotFound, get_distribution def _module_available(module_path: str) -> bool: @@ -39,19 +40,21 @@ def _module_available(module_path: str) -> bool: return False -def _get_version(package: str) -> LooseVersion: - return LooseVersion(pkg_resources.get_distribution(package).version) +def _compare_version(package: str, op, version) -> bool: + try: + pkg_version = LooseVersion(get_distribution(package).version) + return op(pkg_version, LooseVersion(version)) + except DistributionNotFound: + return False _IS_WINDOWS = platform.system() == "Windows" -_TORCH_GREATER_EQUAL_1_6_0 = _get_version("torch") >= LooseVersion("1.6.0") +_TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0") _APEX_AVAILABLE = _module_available("apex.amp") _BOLTS_AVAILABLE = _module_available('pl_bolts') _FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available('fairscale.nn.data_parallel') -_FAIRSCALE_PIPE_AVAILABLE = ( - _FAIRSCALE_AVAILABLE and _TORCH_GREATER_EQUAL_1_6_0 and _get_version('fairscale') <= LooseVersion("0.1.3") -) +_FAIRSCALE_PIPE_AVAILABLE = _TORCH_GREATER_EQUAL_1_6 and _compare_version("fairscale", operator.le, "0.1.3") _GROUP_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.group') _HOROVOD_AVAILABLE = _module_available("horovod.torch") _HYDRA_AVAILABLE = _module_available("hydra") diff --git a/tests/callbacks/test_swa.py b/tests/callbacks/test_swa.py index b12e20f1c2a01..72a4c4fc1ab80 100644 --- a/tests/callbacks/test_swa.py +++ b/tests/callbacks/test_swa.py @@ -21,11 +21,11 @@ from torch.utils.data import DataLoader from pytorch_lightning import Trainer -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6_0 +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel, RandomDataset -if _TORCH_GREATER_EQUAL_1_6_0: +if _TORCH_GREATER_EQUAL_1_6: from pytorch_lightning.callbacks import StochasticWeightAveraging class SwaTestModel(BoringModel): @@ -114,7 +114,7 @@ def train_with_swa(tmpdir, batchnorm=True, accelerator=None, gpus=None, num_proc assert trainer.get_model() == model -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6, reason="SWA available from PyTorch 1.6.0") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" @@ -123,31 +123,31 @@ def test_swa_callback_ddp(tmpdir): train_with_swa(tmpdir, accelerator="ddp", gpus=2) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6, reason="SWA available from PyTorch 1.6.0") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_swa_callback_ddp_spawn(tmpdir): train_with_swa(tmpdir, accelerator="ddp_spawn", gpus=2) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6, reason="SWA available from PyTorch 1.6.0") @pytest.mark.skipif(platform.system() == "Windows", reason="ddp_cpu is not available on 
Windows") def test_swa_callback_ddp_cpu(tmpdir): train_with_swa(tmpdir, accelerator="ddp_cpu", num_processes=2) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6, reason="SWA available from PyTorch 1.6.0") @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires a GPU machine") def test_swa_callback_1_gpu(tmpdir): train_with_swa(tmpdir, gpus=1) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6, reason="SWA available from PyTorch 1.6.0") @pytest.mark.parametrize("batchnorm", (True, False)) def test_swa_callback(tmpdir, batchnorm): train_with_swa(tmpdir, batchnorm=batchnorm) -@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6_0, reason="SWA available from PyTorch 1.6.0") +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_6, reason="SWA available from PyTorch 1.6.0") def test_swa_raises(): with pytest.raises(MisconfigurationException, match=">0 integer or a float between 0 and 1"): StochasticWeightAveraging(swa_epoch_start=0, swa_lrs=0.1) From b434c479e7be787c49be2df381011bed3dc8f070 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 11 Feb 2021 13:04:57 +0100 Subject: [PATCH 05/34] Quantisation (#5706) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * empty * sq * obs * int * ts * helpers * chlog * yapf * avg * dupl * Apply suggestions from code review Co-authored-by: Nicki Skafte * Apply suggestions from code review Co-authored-by: Carlos Mocholí * fixes * Apply suggestions from code review Co-authored-by: Carlos Mocholí * fixes * note * warn * 45 * link Co-authored-by: Carlos Mocholí * Apply suggestions from code review Co-authored-by: Carlos Mocholí * yapf * flake8 * Apply suggestions from code review * Apply suggestions from code review Co-authored-by: Nicki Skafte Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 3 + pytorch_lightning/callbacks/__init__.py | 3 + pytorch_lightning/callbacks/quantization.py | 203 ++++++++++++++++++ pytorch_lightning/core/lightning.py | 11 + pytorch_lightning/core/memory.py | 1 + .../metrics/functional/mean_relative_error.py | 54 +++++ pytorch_lightning/trainer/data_loading.py | 2 +- pytorch_lightning/utilities/__init__.py | 2 + pytorch_lightning/utilities/imports.py | 4 +- tests/__init__.py | 10 + tests/callbacks/test_quantization.py | 136 ++++++++++++ tests/core/test_results.py | 9 +- 12 files changed, 429 insertions(+), 9 deletions(-) create mode 100644 pytorch_lightning/callbacks/quantization.py create mode 100644 pytorch_lightning/metrics/functional/mean_relative_error.py create mode 100644 tests/callbacks/test_quantization.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 6eed2a9a1388a..e51b09899a1fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,6 +77,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added AUC/AUROC class interface ([#5479](https://github.com/PyTorchLightning/pytorch-lightning/pull/5479)) +- Added `QuantizationAwareTraining` callback ([#5706](https://github.com/PyTorchLightning/pytorch-lightning/pull/5706)) + + ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index 3d1c5b2d1c1e7..514782addd10c 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -21,6 +21,7 @@ from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint from pytorch_lightning.callbacks.progress import ProgressBar, ProgressBarBase from pytorch_lightning.callbacks.pruning import ModelPruning +from pytorch_lightning.callbacks.quantization import QuantizationAwareTraining from pytorch_lightning.callbacks.swa import StochasticWeightAveraging __all__ = [ @@ -36,5 +37,7 @@ 'ModelPruning', 'ProgressBar', 'ProgressBarBase', + 'ModelPruning', + 'QuantizationAwareTraining', 'StochasticWeightAveraging', ] diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py new file mode 100644 index 0000000000000..f0458ff3b1369 --- /dev/null +++ b/pytorch_lightning/callbacks/quantization.py @@ -0,0 +1,203 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+r""" +Quantization +^^^^^^^^^^^^ + +""" +import functools +from typing import Any, Callable, Optional, Sequence, Union + +import torch +from torch.quantization import QConfig + +import pytorch_lightning as pl +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_LOWER_EQUAL_1_4 + + +def wrap_qat_forward_context( + quant_cb, + model: pl.core.LightningModule, + func: Callable, + trigger_condition: Optional[Union[Callable, int]] = None +) -> Callable: + """ + Decorator to wrap forward path as it is needed to quantize inputs and dequantize outputs for in/out compatibility + Moreover this version has the (de)quantization conditional as it may not be needed for the training all the time + """ + # todo: consider using registering hook before/after forward + @functools.wraps(func) + def wrapper(data) -> Any: + _is_func_true = isinstance(trigger_condition, Callable) and trigger_condition(model.trainer) + _is_count_true = isinstance(trigger_condition, int) and quant_cb._forward_calls < trigger_condition + _quant_run = trigger_condition is None or _is_func_true or _is_count_true + # apply custom trigger + if _quant_run: + quant_cb._forward_calls += 1 + data = model.quant(data) + data = func(data) + # apply custom trigger + if _quant_run: + data = model.dequant(data) + return data + + return wrapper + + +def wrap_quantize_forward_context(model: pl.core.LightningModule, func: Callable) -> Callable: + """ + Decorator to wrap forward path as it is needed to quantize inputs and dequantize outputs for in/out compatibility + """ + # todo: consider using registering hook before/after forward + @functools.wraps(func) + def wrapper(data) -> Any: + data = model.quant(data) + data = func(data) + data = model.dequant(data) + return data + + return wrapper + + +def _recursive_hasattr(obj: Any, attribs: str, state: bool = True) -> bool: + """recursive check if model has some layers denoted with '.'""" + if '.' in attribs: + attrib, attribs = attribs.split('.', 1) + if hasattr(obj, attrib): + return _recursive_hasattr(getattr(obj, attrib), attribs, state) + return False + return state and hasattr(obj, attribs) + + +class QuantizationAwareTraining(Callback): + """ + Quantization allows speeding up inference and decreasing memory requirements by performing computations + and storing tensors at lower bitwidths (such as INT8 or FLOAT16) than floating point precision. + We use native PyTorch API so for more information see + `Quantization _` + + .. warning:: ``QuantizationAwareTraining`` is in beta and subject to change. 
+ """ + + OBSERVER_TYPES = ('histogram', 'average') + + def __init__( + self, + qconfig: Union[str, QConfig] = 'fbgemm', + observer_type: str = "average", + collect_quantization: Optional[Union[int, Callable]] = None, + modules_to_fuse: Optional[Sequence] = None, + input_compatible: bool = True, + ) -> None: + """ + Args: + qconfig: define quantization configuration see: `torch.quantization.QConfig + _` + or use pre-defined: 'fbgemm' for server inference and 'qnnpack' for mobile inference + observer_type: allows switching between ``MovingAverageMinMaxObserver`` as "average" (default) + and ``HistogramObserver`` as "histogram" which is more computationally expensive + collect_quantization: count or custom function to collect quantization statistics + + - with default ``None`` the quantization observer is called each module forward, + typical use-case can be collecting extended statistic when user uses image/data augmentation + - custom call count to set a fixed number of calls, starting from the beginning + - custom ``Callable`` function with single trainer argument, + see example when you limit call only for last epoch:: + + def custom_trigger_last(trainer): + return trainer.current_epoch == (trainer.max_epochs - 1) + + QuantizationAwareTraining(collect_quantization=custom_trigger_last) + + modules_to_fuse: allows you fuse a few layers together as shown in `diagram + _` + to find which layer types can be fused, check https://github.com/pytorch/pytorch/pull/43286 + input_compatible: preserve quant/dequant layers. This allows to feat any input as to the original model, + but break compatibility to torchscript + """ + if not isinstance(qconfig, (str, QConfig)): + raise MisconfigurationException(f"Unsupported qconfig: f{qconfig}.") + self._qconfig = qconfig + + if observer_type not in self.OBSERVER_TYPES: + raise MisconfigurationException( + f'Unsupported observer type "{observer_type}", allowed are {self.OBSERVER_TYPES}.' + ) + elif observer_type == 'histogram' and _TORCH_LOWER_EQUAL_1_4: + raise MisconfigurationException(f'For using {observer_type} you need to be using pytorch>=1.5.') + self._observer_type = observer_type + + if collect_quantization is not None and not isinstance(collect_quantization, (int, Callable)): + raise MisconfigurationException( + f'Unsupported `collect_quantization` "{collect_quantization}", allowed are `int` or `Callable`.' 
+ ) + self._collect_quantization = collect_quantization + + self.modules_to_fuse = modules_to_fuse + self._input_compatible = input_compatible + self._forward_calls = 0 + + def _check_feasible_fuse(self, model): + if not self.modules_to_fuse: + return False + for group in self.modules_to_fuse: + if not all(_recursive_hasattr(model, m) for m in group): + raise MisconfigurationException( + f'You have requested to fuse {group} but one or more of them is not your model attributes' + ) + return True + + def on_fit_start(self, trainer, pl_module): + # QuantStub converts tensors from floating point to quantized + pl_module.quant = torch.quantization.QuantStub() + # DeQuantStub converts tensors from quantized to floating point + pl_module.dequant = torch.quantization.DeQuantStub() + # manually specify where tensors will be converted from quantized + # to floating point in the quantized model + self.__module_forward = pl_module.forward + pl_module.forward = wrap_qat_forward_context( + quant_cb=self, model=pl_module, func=pl_module.forward, trigger_condition=self._collect_quantization + ) + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference + if isinstance(self._qconfig, str): + if self._observer_type == 'histogram': + pl_module.qconfig = torch.quantization.get_default_qconfig(self._qconfig) + elif self._observer_type == 'average': + pl_module.qconfig = torch.quantization.get_default_qat_qconfig(self._qconfig) + elif isinstance(self._qconfig, QConfig): + pl_module.qconfig = self._qconfig + + if self._check_feasible_fuse(pl_module): + torch.quantization.fuse_modules(pl_module, self.modules_to_fuse, inplace=True) + + # Prepare the model for QAT. This inserts observers and fake_quants in + # the model that will observe weight and activation tensors during calibration. + torch.quantization.prepare_qat(pl_module, inplace=True) + + def on_fit_end(self, trainer, pl_module): + pl_module.eval() + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, fuses modules where appropriate, + # and replaces key operators with quantized implementations. 
+ torch.quantization.convert(pl_module, inplace=True) + # check we shall preserve wrapper + if self._input_compatible: + pl_module.forward = wrap_quantize_forward_context(model=pl_module, func=self.__module_forward) + else: + pl_module.forward = self.__module_forward diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index cddb28be712ca..900c7e1e208fb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -19,6 +19,7 @@ import os import re import tempfile +import uuid from abc import ABC from argparse import Namespace from functools import partial @@ -69,6 +70,7 @@ class LightningModule( "global_rank", "local_rank", "logger", + "model_size", ] + DeviceDtypeModuleMixin.__jit_unused_properties__ def __init__(self, *args, **kwargs): @@ -1763,3 +1765,12 @@ def __get_hparams_assignment_variable(self): return "hparams" return None + + @property + def model_size(self) -> float: + # todo: think about better way without need to dump model to drive + tmp_name = f"{uuid.uuid4().hex}.pt" + torch.save(self.state_dict(), tmp_name) + size_mb = os.path.getsize(tmp_name) / 1e6 + os.remove(tmp_name) + return size_mb diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py index f087da3971d95..e05feff0db5bf 100644 --- a/pytorch_lightning/core/memory.py +++ b/pytorch_lightning/core/memory.py @@ -227,6 +227,7 @@ def trainable_parameters(self) -> int: @property def model_size(self) -> float: + # todo: seems it does not work with quantized models - it returns 0.0 return self.total_parameters * self._precision_megabytes def summarize(self) -> Dict[str, LayerSummary]: diff --git a/pytorch_lightning/metrics/functional/mean_relative_error.py b/pytorch_lightning/metrics/functional/mean_relative_error.py new file mode 100644 index 0000000000000..eedaea1a26a4f --- /dev/null +++ b/pytorch_lightning/metrics/functional/mean_relative_error.py @@ -0,0 +1,54 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
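+#
+# The metric below computes mean(|preds - target| / target), replacing zeros in
+# the target with ones in the denominator to avoid division by zero.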
+from typing import Tuple
+
+import torch
+
+from pytorch_lightning.metrics.utils import _check_same_shape
+
+
+def _mean_relative_error_update(preds: torch.Tensor, target: torch.Tensor) -> Tuple[torch.Tensor, int]:
+    _check_same_shape(preds, target)
+    target_nz = target.clone()
+    target_nz[target == 0] = 1
+    sum_rltv_error = torch.sum(torch.abs((preds - target) / target_nz))
+    n_obs = target.numel()
+    return sum_rltv_error, n_obs
+
+
+def _mean_relative_error_compute(sum_rltv_error: torch.Tensor, n_obs: int) -> torch.Tensor:
+    return sum_rltv_error / n_obs
+
+
+def mean_relative_error(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+    """
+    Computes mean relative error
+
+    Args:
+        preds: estimated labels
+        target: ground truth labels
+
+    Return:
+        Tensor with mean relative error
+
+    Example:
+
+        >>> x = torch.tensor([0., 1, 2, 3])
+        >>> y = torch.tensor([0., 1, 2, 2])
+        >>> mean_relative_error(x, y)
+        tensor(0.1250)
+
+    """
+    sum_rltv_error, n_obs = _mean_relative_error_update(preds, target)
+    return _mean_relative_error_compute(sum_rltv_error, n_obs)
diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 9586a18f592c5..f319dd6594140 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -195,7 +195,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None:
         """
         self.train_dataloader = self.request_dataloader(model.train_dataloader)
 
-        if (self.overfit_batches > 0):
+        if self.overfit_batches > 0:
             if hasattr(self.train_dataloader, 'sampler') and isinstance(self.train_dataloader.sampler, RandomSampler):
                 rank_zero_warn(
                     'You requested to overfit but enabled training dataloader shuffling.'
diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py
index 01cef30cfd71e..94b08029b92c1 100644
--- a/pytorch_lightning/utilities/__init__.py
+++ b/pytorch_lightning/utilities/__init__.py
@@ -37,6 +37,8 @@
     _OMEGACONF_AVAILABLE,
     _RPC_AVAILABLE,
     _TORCH_GREATER_EQUAL_1_6,
+    _TORCH_LOWER_EQUAL_1_4,
+    _TORCH_QUANTIZE_AVAILABLE,
     _TORCHTEXT_AVAILABLE,
     _TORCHVISION_AVAILABLE,
     _XLA_AVAILABLE,
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index f71b7887c5099..8ebcb570a394f 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -49,8 +49,10 @@ def _compare_version(package: str, op, version) -> bool:
 
 _IS_WINDOWS = platform.system() == "Windows"
-_TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0")
+_TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0")
+_TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0")
+_TORCH_QUANTIZE_AVAILABLE = _module_available('torch.ops.quantized')
 _APEX_AVAILABLE = _module_available("apex.amp")
 _BOLTS_AVAILABLE = _module_available('pl_bolts')
 _FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available('fairscale.nn.data_parallel')
diff --git a/tests/__init__.py b/tests/__init__.py
index 57feda6280c38..c9642003d6ceb 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -14,6 +14,9 @@
 import os
 
 import numpy as np
+import torch
+
+from pytorch_lightning.utilities import _TORCH_LOWER_EQUAL_1_4, _TORCH_QUANTIZE_AVAILABLE
 
 _TEST_ROOT = os.path.dirname(__file__)
 _PROJECT_ROOT = os.path.dirname(_TEST_ROOT)
@@ -31,3 +34,10 @@
 
 if not os.path.isdir(_TEMP_PATH):
     os.mkdir(_TEMP_PATH)
+
+_SKIPIF_ARGS_PT_LE_1_4 = dict(condition=_TORCH_LOWER_EQUAL_1_4, reason="test pytorch > 1.4")
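+# the skipif kwargs above and below are meant to be splatted into pytest markers,
+# e.g. @pytest.mark.skipif(**_SKIPIF_ARGS_NO_PT_QUANT)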
+_SKIPIF_ARGS_NO_GPU = dict(condition=not torch.cuda.is_available(), reason="test requires single-GPU machine")
+_SKIPIF_ARGS_NO_GPUS = dict(condition=torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+_SKIPIF_ARGS_NO_PT_QUANT = dict(
+    condition=not _TORCH_QUANTIZE_AVAILABLE, reason="PyTorch quantization is needed for this test"
+)
diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py
new file mode 100644
index 0000000000000..620346c0bd504
--- /dev/null
+++ b/tests/callbacks/test_quantization.py
@@ -0,0 +1,136 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+
+import pytest
+import torch
+
+from pytorch_lightning import seed_everything, Trainer
+from pytorch_lightning.callbacks import QuantizationAwareTraining
+from pytorch_lightning.metrics.functional.mean_relative_error import mean_relative_error
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from tests import _SKIPIF_ARGS_NO_PT_QUANT, _SKIPIF_ARGS_PT_LE_1_4
+from tests.helpers.datamodules import RegressDataModule
+from tests.helpers.simple_models import RegressionModel
+
+
+@pytest.mark.parametrize(
+    "observe", ['average', pytest.param('histogram', marks=pytest.mark.skipif(**_SKIPIF_ARGS_PT_LE_1_4))]
+)
+@pytest.mark.parametrize("fuse", [True, False])
+@pytest.mark.skipif(**_SKIPIF_ARGS_NO_PT_QUANT)
+def test_quantization(tmpdir, observe, fuse):
+    """Parity test for quant model"""
+    seed_everything(42)
+    dm = RegressDataModule()
+    trainer_args = dict(
+        default_root_dir=tmpdir,
+        max_epochs=10,
+        gpus=1 if torch.cuda.is_available() else None,
+    )
+    model = RegressionModel()
+    qmodel = copy.deepcopy(model)
+
+    trainer = Trainer(**trainer_args)
+    trainer.fit(model, datamodule=dm)
+    org_size = model.model_size
+    org_score = torch.mean(torch.tensor([mean_relative_error(model(x), y) for x, y in dm.test_dataloader()]))
+
+    fusing_layers = [(f'layer_{i}', f'layer_{i}a') for i in range(3)] if fuse else None
+    qcb = QuantizationAwareTraining(observer_type=observe, modules_to_fuse=fusing_layers)
+    trainer = Trainer(callbacks=[qcb], **trainer_args)
+    trainer.fit(qmodel, datamodule=dm)
+
+    quant_calls = qcb._forward_calls
+    assert quant_calls == qcb._forward_calls
+
+    quant_size = qmodel.model_size
+    quant_score = torch.mean(torch.tensor([mean_relative_error(qmodel(x), y) for x, y in dm.test_dataloader()]))
+    # test that the trained model is smaller than the initial one
+    size_ratio = quant_size / org_size
+    assert size_ratio < 0.65
+    # test that the test score is almost the same as with pure training
+    assert torch.allclose(org_score, quant_score, atol=0.45)
+
+
+@pytest.mark.skipif(**_SKIPIF_ARGS_NO_PT_QUANT)
+def test_quantize_torchscript(tmpdir):
+    """Test converting to torchscript"""
+    dm = RegressDataModule()
+    qmodel = RegressionModel()
+    qcb = QuantizationAwareTraining(input_compatible=False)
+    trainer = Trainer(callbacks=[qcb], default_root_dir=tmpdir, max_epochs=1)
+    trainer.fit(qmodel, datamodule=dm)
+
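+    # with `input_compatible=False` the original forward is restored after conversion,
+    # so the TorchScript export below is expected to work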
+ qmodel.to_torchscript() + + +def test_quantization_exceptions(tmpdir): + """Test wrong fuse layers""" + with pytest.raises(MisconfigurationException, match='Unsupported qconfig'): + QuantizationAwareTraining(qconfig=['abc']) + + with pytest.raises(MisconfigurationException, match='Unsupported observer type'): + QuantizationAwareTraining(observer_type='abc') + + with pytest.raises(MisconfigurationException, match='Unsupported `collect_quantization`'): + QuantizationAwareTraining(collect_quantization='abc') + + with pytest.raises(MisconfigurationException, match='Unsupported `collect_quantization`'): + QuantizationAwareTraining(collect_quantization=1.2) + + fusing_layers = [(f'layers.mlp_{i}', f'layers.NONE-mlp_{i}a') for i in range(3)] + qcb = QuantizationAwareTraining(modules_to_fuse=fusing_layers) + trainer = Trainer(callbacks=[qcb], default_root_dir=tmpdir, max_epochs=1) + with pytest.raises(MisconfigurationException, match='one or more of them is not your model attributes'): + trainer.fit(RegressionModel(), datamodule=RegressDataModule()) + + +def custom_trigger_never(trainer): + return False + + +def custom_trigger_even(trainer): + return trainer.current_epoch % 2 == 0 + + +def custom_trigger_last(trainer): + return trainer.current_epoch == (trainer.max_epochs - 1) + + +@pytest.mark.parametrize( + "trigger_fn,expected_count", [ + (None, 9), + (3, 3), + (custom_trigger_never, 0), + (custom_trigger_even, 5), + (custom_trigger_last, 2), + ] +) +@pytest.mark.skipif(**_SKIPIF_ARGS_NO_PT_QUANT) +def test_quantization_triggers(tmpdir, trigger_fn, expected_count): + """Test how many times the quant is called""" + dm = RegressDataModule() + qmodel = RegressionModel() + qcb = QuantizationAwareTraining(collect_quantization=trigger_fn) + trainer = Trainer( + callbacks=[qcb], + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=4, + ) + trainer.fit(qmodel, datamodule=dm) + + assert qcb._forward_calls == expected_count diff --git a/tests/core/test_results.py b/tests/core/test_results.py index 42215f8e11371..1793f3e7bbb30 100644 --- a/tests/core/test_results.py +++ b/tests/core/test_results.py @@ -25,6 +25,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.core.step_result import Result from pytorch_lightning.trainer.states import TrainerState +from tests import _SKIPIF_ARGS_NO_GPU from tests.helpers import BoringDataModule, BoringModel @@ -71,13 +72,7 @@ def test_result_reduce_ddp(result_cls): pytest.param(5, False, 0, id='nested_list_predictions'), pytest.param(6, False, 0, id='dict_list_predictions'), pytest.param(7, True, 0, id='write_dict_predictions'), - pytest.param( - 0, - True, - 1, - id='full_loop_single_gpu', - marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires single-GPU machine") - ) + pytest.param(0, True, 1, id='full_loop_single_gpu', marks=pytest.mark.skipif(**_SKIPIF_ARGS_NO_GPU)) ] ) def test_result_obj_predictions(tmpdir, test_option, do_train, gpus): From 8e9a026bc34d8409faa572a7144c2d96a7c039ed Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 11 Feb 2021 20:02:07 +0530 Subject: [PATCH 06/34] [tests/models] refactor with BoringModel (#5507) * update with BoringModel * update with BoringModel * step * try TPU * TPU * update tests * update tpu tests * self * fix * dp * update tests * ref * update tests * fix tpu tests * fix dp and run_prediction * dp * only dp * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code 
review Co-authored-by: Jirka Borovec --- tests/core/test_datamodules.py | 70 +++-- tests/helpers/pipelines.py | 12 +- tests/helpers/simple_models.py | 14 +- .../data/horovod/train_default_model.py | 6 +- tests/models/test_amp.py | 33 ++- tests/models/test_cpu.py | 39 ++- tests/models/test_grad_norm.py | 12 +- tests/models/test_hooks.py | 34 +-- tests/models/test_horovod.py | 21 +- tests/models/test_hparams.py | 111 ++++---- tests/models/test_onnx.py | 14 +- tests/models/test_restore.py | 252 +++++++++++------- tests/models/test_tpu.py | 82 +++--- 13 files changed, 397 insertions(+), 303 deletions(-) diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 1bbbe7c40f088..76fdca0fedd48 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -18,13 +18,16 @@ import pytest import torch +import torch.nn.functional as F from pytorch_lightning import LightningDataModule, Trainer from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState from tests.helpers import BoringDataModule, BoringModel -from tests.helpers.utils import reset_seed +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel +from tests.helpers.utils import reset_seed, set_random_master_port def test_can_prepare_data(tmpdir): @@ -190,8 +193,8 @@ def test_dm_pickle_after_init(tmpdir): def test_train_loop_only(tmpdir): reset_seed() - dm = BoringDataModule() - model = BoringModel() + dm = ClassifDataModule() + model = ClassificationModel() model.validation_step = None model.validation_step_end = None @@ -207,18 +210,17 @@ def test_train_loop_only(tmpdir): ) # fit model - result = trainer.fit(model, dm) + result = trainer.fit(model, datamodule=dm) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" assert result - # TODO: add end-to-end test - # assert trainer.callback_metrics['loss'] < 0.6 + assert trainer.callback_metrics['train_loss'] < 1.0 def test_train_val_loop_only(tmpdir): reset_seed() - dm = BoringDataModule() - model = BoringModel() + dm = ClassifDataModule() + model = ClassificationModel() model.validation_step = None model.validation_step_end = None @@ -231,11 +233,10 @@ def test_train_val_loop_only(tmpdir): ) # fit model - result = trainer.fit(model, dm) + result = trainer.fit(model, datamodule=dm) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" assert result - # TODO: add end-to-end test - # assert trainer.callback_metrics['train_loss'] < 0.6 + assert trainer.callback_metrics['train_loss'] < 1.0 def test_dm_checkpoint_save(tmpdir): @@ -294,8 +295,8 @@ def test_test_loop_only(tmpdir): def test_full_loop(tmpdir): reset_seed() - dm = BoringDataModule() - model = BoringModel() + dm = ClassifDataModule() + model = ClassificationModel() trainer = Trainer( default_root_dir=tmpdir, @@ -311,8 +312,7 @@ def test_full_loop(tmpdir): # test result = trainer.test(datamodule=dm) - # TODO: add end-to-end test - # assert result[0]['test_acc'] > 0.8 + assert result[0]['test_acc'] > 0.6 def test_trainer_attached_to_dm(tmpdir): @@ -346,8 +346,8 @@ def test_trainer_attached_to_dm(tmpdir): def test_full_loop_single_gpu(tmpdir): reset_seed() - dm = BoringDataModule() - model = BoringModel() + dm = ClassifDataModule() + model = ClassificationModel() trainer = Trainer( default_root_dir=tmpdir, @@ -364,16 +364,37 @@ def 
test_full_loop_single_gpu(tmpdir): # test result = trainer.test(datamodule=dm) - # TODO: add end-to-end test - # assert result[0]['test_acc'] > 0.8 + assert result[0]['test_acc'] > 0.6 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_full_loop_dp(tmpdir): - reset_seed() + set_random_master_port() - dm = BoringDataModule() - model = BoringModel() + class CustomClassificationModelDP(ClassificationModel): + + def _step(self, batch, batch_idx): + x, y = batch + logits = self(x) + return {'logits': logits, 'y': y} + + def training_step(self, batch, batch_idx): + _, y = batch + out = self._step(batch, batch_idx) + out['loss'] = F.cross_entropy(out['logits'], y) + return out + + def validation_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def test_step_end(self, outputs): + self.log('test_acc', self.test_acc(outputs['logits'], outputs['y'])) + + dm = ClassifDataModule() + model = CustomClassificationModelDP() trainer = Trainer( default_root_dir=tmpdir, @@ -385,14 +406,13 @@ def test_full_loop_dp(tmpdir): ) # fit model - result = trainer.fit(model, dm) + result = trainer.fit(model, datamodule=dm) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" assert result # test result = trainer.test(datamodule=dm) - # TODO: add end-to-end test - # assert result[0]['test_acc'] > 0.8 + assert result[0]['test_acc'] > 0.6 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py index f5d9823600ede..3f131ab055d98 100644 --- a/tests/helpers/pipelines.py +++ b/tests/helpers/pipelines.py @@ -102,9 +102,9 @@ def run_model_test( def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): if isinstance(trained_model, BoringModel): - return _boring_model_run_prediction(trained_model, dataloader, dp, min_acc) + return _boring_model_run_prediction(trained_model, dataloader, min_acc) else: - return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc) + return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc=min_acc) def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): @@ -135,11 +135,15 @@ def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got {acc})" -def _boring_model_run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): +# TODO: This test compares a loss value with a min accuracy - complete non-sense! +# create BoringModels that make actual predictions! 
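+# A rough sketch of what such a check could look like, assuming a model
+# that returns logits and a dataloader yielding (x, y) pairs (the names
+# below are illustrative, not existing helpers):
+#
+#     x, y = next(iter(dataloader))
+#     with torch.no_grad():
+#         preds = trained_model(x).argmax(dim=-1)
+#     acc = (preds == y).float().mean()
+#     assert acc >= min_acc, f"accuracy {acc} is below the expected {min_acc}"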
+def _boring_model_run_prediction(trained_model, dataloader, min_acc=0.25): # run prediction on 1 batch + trained_model.cpu() batch = next(iter(dataloader)) + with torch.no_grad(): output = trained_model(batch) - acc = trained_model.loss(batch, output) + acc = trained_model.loss(batch, output) assert acc >= min_acc, f"This model is expected to get, {min_acc} in test set but got {acc}" diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py index ebc70690f49fa..9288a3c802276 100644 --- a/tests/helpers/simple_models.py +++ b/tests/helpers/simple_models.py @@ -51,18 +51,21 @@ def training_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) loss = F.cross_entropy(logits, y) - self.log('train_Acc', self.train_acc(logits, y), prog_bar=True) + self.log('train_loss', loss, prog_bar=True) + self.log('train_acc', self.train_acc(logits, y), prog_bar=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) - self.log('valid_Acc', self.valid_acc(logits, y), prog_bar=True) + self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) + self.log('val_acc', self.valid_acc(logits, y), prog_bar=True) def test_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) - self.log('test_Acc', self.test_acc(logits, y), prog_bar=True) + self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) + self.log('test_acc', self.test_acc(logits, y), prog_bar=True) class RegressionModel(LightningModule): @@ -98,15 +101,18 @@ def training_step(self, batch, batch_idx): x, y = batch out = self.forward(x) loss = F.mse_loss(out, y) + self.log('train_loss', loss, prog_bar=False) self.log('train_MSE', self.train_mse(out, y), prog_bar=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch out = self.forward(x) - self.log('valid_MSE', self.valid_mse(out, y), prog_bar=True) + self.log('val_loss', F.mse_loss(out, y), prog_bar=False) + self.log('val_MSE', self.valid_mse(out, y), prog_bar=True) def test_step(self, batch, batch_idx): x, y = batch out = self.forward(x) + self.log('test_loss', F.mse_loss(out, y), prog_bar=False) self.log('test_MSE', self.test_mse(out, y), prog_bar=True) diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 24ddbd24c439f..93a637dda1071 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -36,7 +36,7 @@ else: print('You requested to import Horovod which is missing or not supported for your OS.') -from tests.base import EvalModelTemplate # noqa: E402 +from tests.helpers import BoringModel # noqa: E402 from tests.helpers.pipelines import run_prediction # noqa: E402 from tests.helpers.utils import reset_seed, set_random_master_port # noqa: E402 @@ -53,7 +53,7 @@ def run_test_from_config(trainer_options): ckpt_path = trainer_options['weights_save_path'] trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)]) - model = EvalModelTemplate() + model = BoringModel() trainer = Trainer(**trainer_options) trainer.fit(model) @@ -66,7 +66,7 @@ def run_test_from_config(trainer_options): return # test model loading - pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + pretrained_model = BoringModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # test new model accuracy test_loaders = model.test_dataloader() diff --git a/tests/models/test_amp.py 
b/tests/models/test_amp.py index 8a5b6d005cec1..8d620bb563f2e 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -24,7 +24,7 @@ from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel @pytest.mark.skip(reason='dp + amp not supported currently') # TODO @@ -41,7 +41,7 @@ def test_amp_single_gpu_dp(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -60,7 +60,7 @@ def test_amp_single_gpu_ddp_spawn(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -81,7 +81,7 @@ def test_amp_multi_gpu_dp(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -100,7 +100,7 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() # tutils.run_model_test(trainer_options, model) trainer.fit(model) @@ -108,13 +108,13 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@mock.patch.dict(os.environ, {"SLURM_LOCALID": "0"}) def test_amp_gpu_ddp_slurm_managed(tmpdir): """Make sure DDP + AMP work.""" # simulate setting slurm flags tutils.set_random_master_port() - os.environ['SLURM_LOCALID'] = str(0) - model = EvalModelTemplate() + model = BoringModel() # exp file to get meta logger = tutils.get_default_logger(tmpdir) @@ -156,7 +156,7 @@ def test_cpu_model_with_amp(tmpdir): precision=16, ) - model = EvalModelTemplate() + model = BoringModel() with pytest.raises((MisconfigurationException, ModuleNotFoundError)): tpipes.run_model_test(trainer_options, model, on_gpu=False) @@ -165,7 +165,7 @@ def test_cpu_model_with_amp(tmpdir): @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test_amp_without_apex(tmpdir): """Check that even with apex amp type without requesting precision=16 the amp backend is void.""" - model = EvalModelTemplate() + model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, @@ -190,19 +190,24 @@ def test_amp_without_apex(tmpdir): def test_amp_with_apex(tmpdir): """Check calling apex scaling in training.""" - class CustomModel(EvalModelTemplate): + class CustomModel(BoringModel): + + def training_step(self, batch, batch_idx, optimizer_idx): + return super().training_step(batch, batch_idx) def configure_optimizers(self): - optimizer1 = optim.Adam(self.parameters(), lr=self.learning_rate) - optimizer2 = optim.SGD(self.parameters(), lr=self.learning_rate) + optimizer1 = optim.Adam(self.parameters(), lr=0.01) + optimizer2 = optim.SGD(self.parameters(), lr=0.01) lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] model = CustomModel() + model.training_epoch_end = None + trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1, + max_steps=5, precision=16, amp_backend='apex', gpus=1, @@ -210,7 +215,7 @@ def configure_optimizers(self): assert str(trainer.amp_backend) == "AMPType.APEX" trainer.fit(model) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - 
assert trainer.dev_debugger.count_events('AMP') == 20 + assert trainer.dev_debugger.count_events('AMP') == 10 assert isinstance(trainer.lr_schedulers[0]['scheduler'].optimizer, optim.Adam) assert isinstance(trainer.lr_schedulers[1]['scheduler'].optimizer, optim.SGD) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index c3ac26bdeb01a..e8970c2df4bc5 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -23,7 +23,6 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState -from tests.base import EvalModelTemplate from tests.helpers import BoringModel from tests.helpers.datamodules import ClassifDataModule from tests.helpers.simple_models import ClassificationModel @@ -101,10 +100,12 @@ def test_early_stopping_cpu_model(tmpdir): class ModelTrainVal(BoringModel): - def validation_epoch_end(self, outputs) -> None: - val_loss = torch.stack([x["x"] for x in outputs]).mean() - self.log('val_loss', val_loss) + def validation_step(self, *args, **kwargs): + output = super().validation_step(*args, **kwargs) + self.log('val_loss', output['x']) + return output + tutils.reset_seed() stopping = EarlyStopping(monitor="val_loss", min_delta=0.1) trainer_options = dict( callbacks=[stopping], @@ -198,13 +199,15 @@ def test_running_test_after_fitting(tmpdir): class ModelTrainValTest(BoringModel): - def validation_epoch_end(self, outputs) -> None: - val_loss = torch.stack([x["x"] for x in outputs]).mean() - self.log('val_loss', val_loss) + def validation_step(self, *args, **kwargs): + output = super().validation_step(*args, **kwargs) + self.log('val_loss', output['x']) + return output - def test_epoch_end(self, outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) + def test_step(self, *args, **kwargs): + output = super().test_step(*args, **kwargs) + self.log('test_loss', output['y']) + return output model = ModelTrainValTest() @@ -244,9 +247,10 @@ class ModelTrainTest(BoringModel): def val_dataloader(self): pass - def test_epoch_end(self, outputs) -> None: - test_loss = torch.stack([x["y"] for x in outputs]).mean() - self.log('test_loss', test_loss) + def test_step(self, *args, **kwargs): + output = super().test_step(*args, **kwargs) + self.log('test_loss', output['y']) + return output model = ModelTrainTest() @@ -297,15 +301,10 @@ def test_simple_cpu(tmpdir): def test_cpu_model(tmpdir): """Make sure model trains on CPU.""" trainer_options = dict( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - limit_train_batches=0.4, - limit_val_batches=0.4 + default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, limit_train_batches=4, limit_val_batches=4 ) - model = EvalModelTemplate() - + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 10cfa0cb9a021..4d04911ffadc9 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -20,11 +20,11 @@ from pytorch_lightning import Trainer from pytorch_lightning.trainer.states import TrainerState -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel from tests.helpers.utils import reset_seed -class ModelWithManualGradTracker(EvalModelTemplate): +class ModelWithManualGradTracker(BoringModel): def __init__(self, norm_type, *args, **kwargs): super().__init__(*args, 
**kwargs) @@ -36,9 +36,9 @@ def __init__(self, norm_type, *args, **kwargs): def training_step(self, batch, batch_idx, optimizer_idx=None): # just return a loss, no log or progress bar meta - x, y = batch - loss_val = self.loss(y, self(x.flatten(1, -1))) - return {'loss': loss_val} + output = self(batch) + loss = self.loss(batch, output) + return {'loss': loss} def on_after_backward(self): out, norms = {}, [] @@ -102,7 +102,7 @@ def test_grad_tracking_interval(tmpdir, log_every_n_steps): ) with patch.object(trainer.logger, "log_metrics") as mocked: - model = EvalModelTemplate() + model = BoringModel() trainer.fit(model) expected = trainer.global_step // log_every_n_steps grad_norm_dicts = [] diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 8e7615baa75f4..969597a10f36d 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -21,14 +21,13 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator from pytorch_lightning.trainer.states import TrainerState -from tests.base import EvalModelTemplate from tests.helpers import BoringModel, RandomDataset @pytest.mark.parametrize('max_steps', [1, 2, 3]) def test_on_before_zero_grad_called(tmpdir, max_steps): - class CurrentTestModel(EvalModelTemplate): + class CurrentTestModel(BoringModel): on_before_zero_grad_called = 0 def on_before_zero_grad(self, optimizer): @@ -40,7 +39,6 @@ def on_before_zero_grad(self, optimizer): default_root_dir=tmpdir, max_steps=max_steps, max_epochs=2, - num_sanity_val_steps=5, ) assert 0 == model.on_before_zero_grad_called trainer.fit(model) @@ -55,23 +53,24 @@ def test_training_epoch_end_metrics_collection(tmpdir): """ Test that progress bar metrics also get collected at the end of an epoch. 
""" num_epochs = 3 - class CurrentModel(EvalModelTemplate): + class CurrentModel(BoringModel): def training_step(self, *args, **kwargs): output = super().training_step(*args, **kwargs) - output['progress_bar'].update({'step_metric': torch.tensor(-1)}) - output['progress_bar'].update({'shared_metric': 100}) + self.log_dict({'step_metric': torch.tensor(-1), 'shared_metric': 100}, logger=False, prog_bar=True) return output def training_epoch_end(self, outputs): epoch = self.current_epoch # both scalar tensors and Python numbers are accepted - return { - 'progress_bar': { - f'epoch_metric_{epoch}': torch.tensor(epoch), # add a new metric key every epoch - 'shared_metric': 111, - } - } + self.log_dict( + { + f'epoch_metric_{epoch}': torch.tensor(epoch), + 'shared_metric': 111 + }, + logger=False, + prog_bar=True, + ) model = CurrentModel() trainer = Trainer( @@ -103,7 +102,7 @@ def on_train_epoch_start(self, trainer, pl_module): def on_train_epoch_end(self, trainer, pl_module, outputs): self.len_outputs = len(outputs[0]) - class OverriddenModel(EvalModelTemplate): + class OverriddenModel(BoringModel): def on_train_epoch_start(self): self.num_train_batches = 0 @@ -114,7 +113,7 @@ def training_epoch_end(self, outputs): # Overridden def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): self.num_train_batches += 1 - class NotOverriddenModel(EvalModelTemplate): + class NotOverriddenModel(BoringModel): def on_train_epoch_start(self): self.num_train_batches = 0 @@ -124,6 +123,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): overridden_model = OverriddenModel() not_overridden_model = NotOverriddenModel() + not_overridden_model.training_epoch_end = None callback = LoggingCallback() trainer = Trainer( @@ -152,7 +152,7 @@ def __init__(self, data): self.samples = data[0] self.targets = data[1] - class CurrentTestModel(EvalModelTemplate): + class CurrentTestModel(BoringModel): hook_called = False @@ -166,7 +166,7 @@ def transfer_batch_to_device(self, data, device): return data model = CurrentTestModel() - batch = CustomBatch((torch.zeros(5, 28), torch.ones(5, 1, dtype=torch.long))) + batch = CustomBatch((torch.zeros(5, 32), torch.ones(5, 1, dtype=torch.long))) trainer = Trainer(gpus=1) trainer.accelerator_backend = GPUAccelerator(trainer) @@ -226,7 +226,7 @@ def train_dataloader(self): @pytest.mark.parametrize('max_epochs,batch_idx_', [(2, 5), (3, 8), (4, 12)]) def test_on_train_batch_start_hook(max_epochs, batch_idx_): - class CurrentModel(EvalModelTemplate): + class CurrentModel(BoringModel): def on_train_batch_start(self, batch, batch_idx, dataloader_idx): if batch_idx == batch_idx_: diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 948fb0144d4d5..19f39b3da4c46 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -30,9 +30,8 @@ from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel from tests.helpers.advanced_models import BasicGAN -from tests.helpers.boring_model import BoringModel if _HOROVOD_AVAILABLE: import horovod @@ -173,22 +172,17 @@ def test_horovod_amp(tmpdir): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_horovod_transfer_batch_to_gpu(tmpdir): - class TestTrainingStepModel(EvalModelTemplate): 
+ class TestTrainingStepModel(BoringModel): def training_step(self, batch, *args, **kwargs): - x, y = batch - assert str(x.device) != 'cpu' - assert str(y.device) != 'cpu' + assert str(batch.device) != 'cpu' return super(TestTrainingStepModel, self).training_step(batch, *args, **kwargs) def validation_step(self, batch, *args, **kwargs): - x, y = batch - assert str(x.device) != 'cpu' - assert str(y.device) != 'cpu' + assert str(batch.device) != 'cpu' return super(TestTrainingStepModel, self).validation_step(batch, *args, **kwargs) - hparams = EvalModelTemplate.get_default_hparams() - model = TestTrainingStepModel(**hparams) + model = TestTrainingStepModel() trainer_options = dict( default_root_dir=str(tmpdir), @@ -205,7 +199,7 @@ def validation_step(self, batch, *args, **kwargs): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") def test_horovod_multi_optimizer(tmpdir): - model = BasicGAN(**EvalModelTemplate.get_default_hparams()) + model = BasicGAN() # fit model trainer = Trainer( @@ -342,8 +336,7 @@ def _compute_batch(): # @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") # def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): -# hparams = EvalModelTemplate.get_default_hparams() -# model = EvalModelTemplate(**hparams) +# model = BoringModel() # model.configure_optimizers = model.configure_optimizers__multiple_schedulers # # num_workers = 8 diff --git a/tests/models/test_hparams.py b/tests/models/test_hparams.py index 229c8128aeb88..0e32ebea09d85 100644 --- a/tests/models/test_hparams.py +++ b/tests/models/test_hparams.py @@ -21,15 +21,13 @@ import torch from fsspec.implementations.local import LocalFileSystem from omegaconf import Container, OmegaConf -from torch.nn import functional as F from torch.utils.data import DataLoader from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml from pytorch_lightning.utilities import _HYDRA_EXPERIMENTAL_AVAILABLE, AttributeDict, is_picklable -from tests.base import EvalModelTemplate -from tests.helpers import BoringModel, TrialMNIST +from tests.helpers import BoringModel, RandomDataset if _HYDRA_EXPERIMENTAL_AVAILABLE: from hydra.experimental import compose, initialize @@ -162,7 +160,7 @@ def test_explicit_args_hparams(tmpdir): """ # define model - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, test_arg, test_arg2): super().__init__() @@ -184,7 +182,7 @@ def test_implicit_args_hparams(tmpdir): """ # define model - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, test_arg, test_arg2): super().__init__() @@ -206,7 +204,7 @@ def test_explicit_missing_args_hparams(tmpdir): """ # define model - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, test_arg, test_arg2): super().__init__() @@ -269,7 +267,14 @@ def test2(self): A().test() -class SubClassEvalModel(EvalModelTemplate): +class CustomBoringModel(BoringModel): + + def __init__(self, batch_size=64): + super().__init__() + self.save_hyperparameters() + + +class SubClassBoringModel(CustomBoringModel): any_other_loss = torch.nn.CrossEntropyLoss() def __init__(self, *args, subclass_arg=1200, **kwargs): @@ -277,18 +282,18 @@ def __init__(self, *args, subclass_arg=1200, **kwargs): self.save_hyperparameters() -class 
SubSubClassEvalModel(SubClassEvalModel): +class SubSubClassBoringModel(SubClassBoringModel): pass -class AggSubClassEvalModel(SubClassEvalModel): +class AggSubClassBoringModel(SubClassBoringModel): def __init__(self, *args, my_loss=torch.nn.CrossEntropyLoss(), **kwargs): super().__init__(*args, **kwargs) self.save_hyperparameters() -class UnconventionalArgsEvalModel(EvalModelTemplate): +class UnconventionalArgsBoringModel(CustomBoringModel): """ A model that has unconventional names for "self", "*args" and "**kwargs". """ def __init__(obj, *more_args, other_arg=300, **more_kwargs): @@ -297,7 +302,7 @@ def __init__(obj, *more_args, other_arg=300, **more_kwargs): obj.save_hyperparameters() -class DictConfSubClassEvalModel(SubClassEvalModel): +class DictConfSubClassBoringModel(SubClassBoringModel): def __init__(self, *args, dict_conf=OmegaConf.create(dict(my_param='something')), **kwargs): super().__init__(*args, **kwargs) @@ -306,31 +311,31 @@ def __init__(self, *args, dict_conf=OmegaConf.create(dict(my_param='something')) @pytest.mark.parametrize( "cls", [ - EvalModelTemplate, - SubClassEvalModel, - SubSubClassEvalModel, - AggSubClassEvalModel, - UnconventionalArgsEvalModel, - DictConfSubClassEvalModel, + CustomBoringModel, + SubClassBoringModel, + SubSubClassBoringModel, + AggSubClassBoringModel, + UnconventionalArgsBoringModel, + DictConfSubClassBoringModel, ] ) def test_collect_init_arguments(tmpdir, cls): """ Test that the model automatically saves the arguments passed into the constructor """ extra_args = {} - if cls is AggSubClassEvalModel: + if cls is AggSubClassBoringModel: extra_args.update(my_loss=torch.nn.CosineEmbeddingLoss()) - elif cls is DictConfSubClassEvalModel: + elif cls is DictConfSubClassBoringModel: extra_args.update(dict_conf=OmegaConf.create(dict(my_param='anything'))) model = cls(**extra_args) - assert model.hparams.batch_size == 32 + assert model.hparams.batch_size == 64 model = cls(batch_size=179, **extra_args) assert model.hparams.batch_size == 179 - if isinstance(model, SubClassEvalModel): + if isinstance(model, SubClassBoringModel): assert model.hparams.subclass_arg == 1200 - if isinstance(model, AggSubClassEvalModel): + if isinstance(model, AggSubClassBoringModel): assert isinstance(model.hparams.my_loss, torch.nn.CosineEmbeddingLoss) # verify that the checkpoint saved the correct values @@ -347,10 +352,10 @@ def test_collect_init_arguments(tmpdir, cls): model = cls.load_from_checkpoint(raw_checkpoint_path) assert model.hparams.batch_size == 179 - if isinstance(model, AggSubClassEvalModel): + if isinstance(model, AggSubClassBoringModel): assert isinstance(model.hparams.my_loss, torch.nn.CosineEmbeddingLoss) - if isinstance(model, DictConfSubClassEvalModel): + if isinstance(model, DictConfSubClassBoringModel): assert isinstance(model.hparams.dict_conf, Container) assert model.hparams.dict_conf['my_param'] == 'anything' @@ -368,7 +373,7 @@ def _raw_checkpoint_path(trainer) -> str: return raw_checkpoint_path -class LocalVariableModelSuperLast(EvalModelTemplate): +class LocalVariableModelSuperLast(BoringModel): """ This model has the super().__init__() call at the end. """ def __init__(self, arg1, arg2, *args, **kwargs): @@ -378,7 +383,7 @@ def __init__(self, arg1, arg2, *args, **kwargs): super().__init__(*args, **kwargs) # this is intentionally here at the end -class LocalVariableModelSuperFirst(EvalModelTemplate): +class LocalVariableModelSuperFirst(BoringModel): """ This model has the _auto_collect_arguments() call at the end. 
""" def __init__(self, arg1, arg2, *args, **kwargs): @@ -429,16 +434,17 @@ def test_collect_init_arguments_with_local_vars(cls): # assert model.hparams.my_arg == 42 -class AnotherArgModel(EvalModelTemplate): +class AnotherArgModel(BoringModel): def __init__(self, arg1): super().__init__() self.save_hyperparameters(arg1) -class OtherArgsModel(EvalModelTemplate): +class OtherArgsModel(BoringModel): def __init__(self, arg1, arg2): + super().__init__() self.save_hyperparameters(arg1, arg2) @@ -457,7 +463,7 @@ def test_single_config_models_fail(tmpdir, cls, config): @pytest.mark.parametrize("past_key", ['module_arguments']) def test_load_past_checkpoint(tmpdir, past_key): - model = EvalModelTemplate() + model = CustomBoringModel() # verify we can train trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) @@ -474,7 +480,7 @@ def test_load_past_checkpoint(tmpdir, past_key): torch.save(raw_checkpoint, raw_checkpoint_path) # verify that model loads correctly - model2 = EvalModelTemplate.load_from_checkpoint(raw_checkpoint_path) + model2 = CustomBoringModel.load_from_checkpoint(raw_checkpoint_path) assert model2.hparams.batch_size == -17 @@ -486,7 +492,7 @@ def test_hparams_pickle(tmpdir): assert ad == pickle.loads(pkl) -class UnpickleableArgsEvalModel(EvalModelTemplate): +class UnpickleableArgsBoringModel(BoringModel): """ A model that has an attribute that cannot be pickled. """ def __init__(self, foo='bar', pickle_me=(lambda x: x + 1), **kwargs): @@ -496,7 +502,7 @@ def __init__(self, foo='bar', pickle_me=(lambda x: x + 1), **kwargs): def test_hparams_pickle_warning(tmpdir): - model = UnpickleableArgsEvalModel() + model = UnpickleableArgsBoringModel() trainer = Trainer(default_root_dir=tmpdir, max_steps=1) with pytest.warns(UserWarning, match="attribute 'pickle_me' removed from hparams because it cannot be pickled"): trainer.fit(model) @@ -522,38 +528,15 @@ def test_hparams_save_yaml(tmpdir): assert load_hparams_from_yaml(path_yaml) == hparams -class NoArgsSubClassEvalModel(EvalModelTemplate): - - def __init__(self): - super().__init__() - - -class SimpleNoArgsModel(LightningModule): +class NoArgsSubClassBoringModel(CustomBoringModel): def __init__(self): super().__init__() - self.l1 = torch.nn.Linear(28 * 28, 10) - - def forward(self, x): - return torch.relu(self.l1(x.view(x.size(0), -1))) - - def training_step(self, batch, batch_nb): - x, y = batch - loss = F.cross_entropy(self(x), y) - return {'loss': loss, 'log': {'train_loss': loss}} - - def test_step(self, batch, batch_nb): - x, y = batch - loss = F.cross_entropy(self(x), y) - return {'loss': loss, 'log': {'train_loss': loss}} - - def configure_optimizers(self): - return torch.optim.Adam(self.parameters(), lr=0.02) @pytest.mark.parametrize("cls", [ - SimpleNoArgsModel, - NoArgsSubClassEvalModel, + BoringModel, + NoArgsSubClassBoringModel, ]) def test_model_nohparams_train_test(tmpdir, cls): """Test models that do not tae any argument in init.""" @@ -564,20 +547,20 @@ def test_model_nohparams_train_test(tmpdir, cls): default_root_dir=tmpdir, ) - train_loader = DataLoader(TrialMNIST(os.getcwd(), train=True, download=True), batch_size=32) + train_loader = DataLoader(RandomDataset(32, 64), batch_size=32) trainer.fit(model, train_loader) - test_loader = DataLoader(TrialMNIST(os.getcwd(), train=False, download=True), batch_size=32) + test_loader = DataLoader(RandomDataset(32, 64), batch_size=32) trainer.test(test_dataloaders=test_loader) def test_model_ignores_non_exist_kwargument(tmpdir): """Test that the model takes only valid class 
arguments.""" - class LocalModel(EvalModelTemplate): + class LocalModel(BoringModel): def __init__(self, batch_size=15): - super().__init__(batch_size=batch_size) + super().__init__() self.save_hyperparameters() model = LocalModel() @@ -593,11 +576,11 @@ def __init__(self, batch_size=15): assert 'non_exist_kwarg' not in model.hparams -class SuperClassPositionalArgs(EvalModelTemplate): +class SuperClassPositionalArgs(BoringModel): def __init__(self, hparams): super().__init__() - self._hparams = None # pretend EvalModelTemplate did not call self.save_hyperparameters() + self._hparams = None # pretend BoringModel did not call self.save_hyperparameters() self.hparams = hparams diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index e0314943613eb..2bd3ebf9b6e87 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -21,14 +21,13 @@ import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer -from tests.base import EvalModelTemplate from tests.helpers import BoringModel def test_model_saves_with_input_sample(tmpdir): """Test that ONNX model saves with input sample and size is greater than 3 MB""" model = BoringModel() - trainer = Trainer(max_epochs=1) + trainer = Trainer(fast_dev_run=True) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") @@ -42,7 +41,7 @@ def test_model_saves_with_input_sample(tmpdir): def test_model_saves_on_gpu(tmpdir): """Test that model saves on gpu""" model = BoringModel() - trainer = Trainer(gpus=1, max_epochs=1) + trainer = Trainer(gpus=1, fast_dev_run=True) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") @@ -55,7 +54,7 @@ def test_model_saves_on_gpu(tmpdir): def test_model_saves_with_example_output(tmpdir): """Test that ONNX model saves when provided with example output""" model = BoringModel() - trainer = Trainer(max_epochs=1) + trainer = Trainer(fast_dev_run=True) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") @@ -92,9 +91,10 @@ def test_model_saves_on_multi_gpu(tmpdir): progress_bar_refresh_rate=0, ) - model = EvalModelTemplate() + model = BoringModel() + model.example_input_array = torch.randn(5, 32) - tpipes.run_model_test(trainer_options, model) + tpipes.run_model_test(trainer_options, model, min_acc=0.08) file_path = os.path.join(tmpdir, "model.onnx") model.to_onnx(file_path) @@ -130,7 +130,7 @@ def test_if_inference_output_is_valid(tmpdir): model = BoringModel() model.example_input_array = torch.randn(5, 32) - trainer = Trainer(max_epochs=2) + trainer = Trainer(fast_dev_run=True) trainer.fit(model) model.eval() diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 9420da74f0e48..114ebf33681dc 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -16,18 +16,21 @@ import os import pickle from copy import deepcopy +from typing import Generic, TypeVar import cloudpickle import pytest import torch +import torch.nn.functional as F import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Callback, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import RunningStage, TrainerState -from tests.base import EvalModelTemplate, GenericEvalModelTemplate from tests.helpers import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel class ModelTrainerPropertyParity(Callback): @@ -52,14 +55,48 @@ def 
on_train_end(self, trainer, pl_module): self._check_properties(trainer, pl_module) +class ValTestLossBoringModel(BoringModel): + + def __init__(self, batch_size=4): + super().__init__() + self.save_hyperparameters() + + def validation_step(self, batch, batch_idx): + out = super().validation_step(batch, batch_idx) + self.log('val_loss', out['x']) + return out + + def test_step(self, batch, batch_idx): + out = super().test_step(batch, batch_idx) + self.log('test_loss', out['y']) + return out + + +T = TypeVar('T') + + +class GenericParentValTestLossBoringModel(Generic[T], ValTestLossBoringModel): + + def __init__(self, batch_size: int = 4): + super().__init__(batch_size=batch_size) + + +class GenericValTestLossBoringModel(GenericParentValTestLossBoringModel[int]): + pass + + def test_model_properties_resume_from_checkpoint(tmpdir): - """ Test that properties like `current_epoch` and `global_step` - in model and trainer are always the same. """ - model = EvalModelTemplate() - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + """ + Test that properties like `current_epoch` and `global_step` + in model and trainer are always the same. + """ + model = BoringModel() + checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer_args = dict( default_root_dir=tmpdir, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=False, callbacks=[checkpoint_callback, ModelTrainerPropertyParity()], # this performs the assertions ) @@ -73,18 +110,19 @@ def test_model_properties_resume_from_checkpoint(tmpdir): def test_try_resume_from_non_existing_checkpoint(tmpdir): """ Test that trying to resume from non-existing `resume_from_checkpoint` fail without error.""" - model = BoringModel() - checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + dm = ClassifDataModule() + model = ClassificationModel() + checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, logger=False, callbacks=[checkpoint_cb], - limit_train_batches=0.1, - limit_val_batches=0.1, + limit_train_batches=2, + limit_val_batches=2, ) # Generate checkpoint `last.ckpt` with BoringModel - trainer.fit(model) + trainer.fit(model, datamodule=dm) # `True` if resume/restore successfully else `False` assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu) assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu) @@ -99,11 +137,12 @@ def on_train_start(self, trainer, pl_module): def test_callbacks_state_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint restores callbacks that persist state. 
""" - model = EvalModelTemplate() + dm = ClassifDataModule() + model = ClassificationModel() callback_capture = CaptureCallbacksBeforeTraining() def get_trainer_args(): - checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer_args = dict( default_root_dir=tmpdir, max_steps=1, logger=False, callbacks=[ checkpoint, @@ -116,12 +155,12 @@ def get_trainer_args(): # initial training trainer = Trainer(**get_trainer_args()) - trainer.fit(model) + trainer.fit(model, datamodule=dm) callbacks_before_resume = deepcopy(trainer.callbacks) # resumed training trainer = Trainer(**get_trainer_args(), resume_from_checkpoint=str(tmpdir / "last.ckpt")) - trainer.fit(model) + trainer.fit(model, datamodule=dm) assert len(callbacks_before_resume) == len(callback_capture.callbacks) @@ -133,23 +172,24 @@ def get_trainer_args(): def test_callbacks_references_resume_from_checkpoint(tmpdir): """ Test that resuming from a checkpoint sets references as expected. """ - model = EvalModelTemplate() + dm = ClassifDataModule() + model = ClassificationModel() args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} # initial training - checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) trainer = Trainer(**args, callbacks=[checkpoint]) assert checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback - trainer.fit(model) + trainer.fit(model, datamodule=dm) # resumed training - new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="val_loss", save_last=True) # pass in a new checkpoint object, which should take # precedence over the one in the last.ckpt file trainer = Trainer(**args, callbacks=[new_checkpoint], resume_from_checkpoint=str(tmpdir / "last.ckpt")) assert checkpoint is not new_checkpoint assert new_checkpoint is trainer.callbacks[-1] is trainer.checkpoint_callback - trainer.fit(model) + trainer.fit(model, datamodule=dm) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -158,7 +198,30 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): tutils.set_random_master_port() - model = EvalModelTemplate() + class CustomClassificationModelDP(ClassificationModel): + + def _step(self, batch, batch_idx): + x, y = batch + logits = self(x) + return {'logits': logits, 'y': y} + + def training_step(self, batch, batch_idx): + _, y = batch + out = self._step(batch, batch_idx) + out['loss'] = F.cross_entropy(out['logits'], y) + return out + + def validation_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def validation_step_end(self, outputs): + self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y'])) + + dm = ClassifDataModule() + model = CustomClassificationModelDP() # exp file to get meta logger = tutils.get_default_logger(tmpdir) @@ -169,8 +232,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): trainer_options = dict( progress_bar_refresh_rate=0, max_epochs=2, - limit_train_batches=0.4, - limit_val_batches=0.2, + limit_train_batches=5, + limit_val_batches=5, callbacks=[checkpoint], logger=logger, gpus=[0, 1], @@ -180,21 +243,17 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): # fit model 
trainer = Trainer(**trainer_options) - trainer.fit(model) + trainer.fit(model, datamodule=dm) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # run test set new_trainer = Trainer(**trainer_options) - results = new_trainer.test(pretrained_model) + new_trainer.test(pretrained_model) pretrained_model.cpu() - # test we have good test accuracy - acc = results[0]['test_acc'] - assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. test_acc = {acc}" - dataloaders = model.test_dataloader() if not isinstance(dataloaders, list): dataloaders = [dataloaders] @@ -207,8 +266,8 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): """Verify `test()` on pretrained model.""" tutils.set_random_master_port() - - model = EvalModelTemplate() + dm = ClassifDataModule() + model = ClassificationModel() # exp file to get meta logger = tutils.get_default_logger(tmpdir) @@ -219,8 +278,8 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): trainer_options = dict( progress_bar_refresh_rate=0, max_epochs=2, - limit_train_batches=0.4, - limit_val_batches=0.2, + limit_train_batches=2, + limit_val_batches=2, callbacks=[checkpoint], logger=logger, gpus=[0, 1], @@ -230,33 +289,32 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): # fit model trainer = Trainer(**trainer_options) - trainer.fit(model) + trainer.fit(model, datamodule=dm) log.info(os.listdir(tutils.get_data_path(logger, path_dir=tmpdir))) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # run test set new_trainer = Trainer(**trainer_options) - results = new_trainer.test(pretrained_model) + new_trainer.test(pretrained_model) pretrained_model.cpu() - acc = results[0]['test_acc'] - assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. 
test_acc = {acc}" - - dataloaders = model.test_dataloader() + dataloaders = dm.test_dataloader() if not isinstance(dataloaders, list): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(pretrained_model, dataloader) + tpipes.run_prediction(pretrained_model, dataloader, min_acc=0.1) def test_running_test_pretrained_model_cpu(tmpdir): """Verify test() on pretrained model.""" - model = EvalModelTemplate() + tutils.reset_seed() + dm = ClassifDataModule() + model = ClassificationModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -266,9 +324,10 @@ def test_running_test_pretrained_model_cpu(tmpdir): trainer_options = dict( progress_bar_refresh_rate=0, - max_epochs=3, - limit_train_batches=0.4, - limit_val_batches=0.2, + max_epochs=2, + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, callbacks=[checkpoint], logger=logger, default_root_dir=tmpdir, @@ -276,31 +335,32 @@ def test_running_test_pretrained_model_cpu(tmpdir): # fit model trainer = Trainer(**trainer_options) - trainer.fit(model) + trainer.fit(model, datamodule=dm) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - pretrained_model = EvalModelTemplate.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) + pretrained_model = ClassificationModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) new_trainer = Trainer(**trainer_options) - new_trainer.test(pretrained_model) + new_trainer.test(pretrained_model, datamodule=dm) # test we have good test accuracy - tutils.assert_ok_model_acc(new_trainer) + tutils.assert_ok_model_acc(new_trainer, key='test_acc', thr=0.45) -@pytest.mark.parametrize('model_template', [EvalModelTemplate, GenericEvalModelTemplate]) +@pytest.mark.parametrize('model_template', [ValTestLossBoringModel, GenericValTestLossBoringModel]) def test_load_model_from_checkpoint(tmpdir, model_template): """Verify test() on pretrained model.""" - hparams = model_template.get_default_hparams() - model = model_template(**hparams) + tutils.reset_seed() + model = model_template() trainer_options = dict( progress_bar_refresh_rate=0, max_epochs=2, - limit_train_batches=0.4, - limit_val_batches=0.2, - callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='early_stop_on', save_top_k=-1)], + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='val_loss', save_top_k=-1)], default_root_dir=tmpdir, ) @@ -315,7 +375,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template): # load last checkpoint last_checkpoint = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))[-1] - # Since `EvalModelTemplate` has `_save_hparams = True` by default, check that ckpt has hparams + # Since `BoringModel` has `_save_hparams = True` by default, check that ckpt has hparams ckpt = torch.load(last_checkpoint) assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'hyper_parameters missing from checkpoints' @@ -323,8 +383,8 @@ def test_load_model_from_checkpoint(tmpdir, model_template): pretrained_model = model_template.load_from_checkpoint(last_checkpoint) # test that hparams loaded correctly - for k, v in hparams.items(): - assert getattr(pretrained_model, k) == v + for k, v in model.hparams.items(): + assert getattr(pretrained_model.hparams, k) == v # assert weights are the same for (old_name, old_p), (new_name, new_p) in zip(model.named_parameters(), 
pretrained_model.named_parameters()): @@ -334,15 +394,11 @@ def test_load_model_from_checkpoint(tmpdir, model_template): new_trainer = Trainer(**trainer_options) new_trainer.test(pretrained_model) - # test we have good test accuracy - tutils.assert_ok_model_acc(new_trainer) - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" - hparams = EvalModelTemplate.get_default_hparams() - model = EvalModelTemplate(**hparams) + model = BoringModel() trainer_options = dict(max_epochs=1, gpus=2, accelerator='dp', default_root_dir=tmpdir) @@ -355,7 +411,7 @@ def test_dp_resume(tmpdir): # add these to the trainer options trainer_options['logger'] = logger - trainer_options['checkpoint_callback'] = checkpoint + trainer_options['callbacks'] = [checkpoint] # fit model trainer = Trainer(**trainer_options) @@ -377,31 +433,38 @@ def test_dp_resume(tmpdir): # init new trainer new_logger = tutils.get_default_logger(tmpdir, version=logger.version) trainer_options['logger'] = new_logger - trainer_options['checkpoint_callback'] = ModelCheckpoint(dirpath=tmpdir) + trainer_options['callbacks'] = [ModelCheckpoint(dirpath=tmpdir)] trainer_options['limit_train_batches'] = 0.5 trainer_options['limit_val_batches'] = 0.2 trainer_options['max_epochs'] = 1 new_trainer = Trainer(**trainer_options) - # set the epoch start hook so we can predict before the model does the full training - def assert_good_acc(): - assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0 + class CustomModel(BoringModel): + + def __init__(self): + super().__init__() + self.on_train_start_called = False + + # set the epoch start hook so we can predict before the model does the full training + def on_train_start(self): + assert self.trainer.current_epoch == real_global_epoch and self.trainer.current_epoch > 0 - # if model and state loaded correctly, predictions will be good even though we - # haven't trained with the new loaded model - dp_model = new_trainer.model - dp_model.eval() - dp_model.module.module.running_stage = RunningStage.EVALUATING + # if model and state loaded correctly, predictions will be good even though we + # haven't trained with the new loaded model + dp_model = new_trainer.model + dp_model.eval() + dp_model.module.module.running_stage = RunningStage.EVALUATING - dataloader = trainer.train_dataloader - tpipes.run_prediction(dp_model, dataloader, dp=True) + dataloader = self.train_dataloader() + tpipes.run_prediction(self.trainer.get_model(), dataloader) + self.on_train_start_called = True # new model - model = EvalModelTemplate(**hparams) - model.on_train_start = assert_good_acc + model = CustomModel() # fit new model which should load hpc weights new_trainer.fit(model) + assert model.on_train_start_called # test freeze on gpu model.freeze() @@ -410,7 +473,7 @@ def assert_good_acc(): def test_model_saving_loading(tmpdir): """Tests use case where trainer saves the model, and user loads it from tags independently.""" - model = EvalModelTemplate() + model = BoringModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -418,6 +481,8 @@ def test_model_saving_loading(tmpdir): # fit model trainer = Trainer( max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], default_root_dir=tmpdir, @@ -432,16 +497,11 @@ def test_model_saving_loading(tmpdir): if not isinstance(dataloaders, list): dataloaders = 
[dataloaders] - for dataloader in dataloaders: - for batch in dataloader: - break - - x, y = batch - x = x.view(x.size(0), -1) + batch = next(iter(dataloaders[0])) # generate preds before saving model model.eval() - pred_before_saving = model(x) + pred_before_saving = model(batch) # save model new_weights_path = os.path.join(tmpdir, 'save_test.ckpt') @@ -450,7 +510,7 @@ def test_model_saving_loading(tmpdir): # load new model hparams_path = tutils.get_data_path(logger, path_dir=tmpdir) hparams_path = os.path.join(hparams_path, 'hparams.yaml') - model_2 = EvalModelTemplate.load_from_checkpoint( + model_2 = BoringModel.load_from_checkpoint( checkpoint_path=new_weights_path, hparams_file=hparams_path, ) @@ -458,7 +518,7 @@ def test_model_saving_loading(tmpdir): # make prediction # assert that both predictions are the same - new_pred = model_2(x) + new_pred = model_2(batch) assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 @@ -468,9 +528,9 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir monkeypatch.setenv('TORCH_HOME', tmpdir) - model = EvalModelTemplate() + model = BoringModel() # Extra layer - model.c_d3 = torch.nn.Linear(model.hidden_dim, model.hidden_dim) + model.c_d3 = torch.nn.Linear(32, 32) # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -479,6 +539,8 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], ) @@ -496,14 +558,14 @@ def test_strict_model_load_more_params(monkeypatch, tmpdir, tmpdir_server, url_c hparams_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}' ckpt_path = hparams_url if url_ckpt else new_weights_path - EvalModelTemplate.load_from_checkpoint( + BoringModel.load_from_checkpoint( checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=False, ) with pytest.raises(RuntimeError, match=r'Unexpected key\(s\) in state_dict: "c_d3.weight", "c_d3.bias"'): - EvalModelTemplate.load_from_checkpoint( + BoringModel.load_from_checkpoint( checkpoint_path=ckpt_path, hparams_file=hparams_path, strict=True, @@ -516,7 +578,7 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir monkeypatch.setenv('TORCH_HOME', tmpdir) - model = EvalModelTemplate() + model = BoringModel() # logger file to get meta logger = tutils.get_default_logger(tmpdir) @@ -525,6 +587,8 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, + limit_train_batches=2, + limit_val_batches=2, logger=logger, callbacks=[ModelCheckpoint(dirpath=tmpdir)], ) @@ -542,7 +606,7 @@ def test_strict_model_load_less_params(monkeypatch, tmpdir, tmpdir_server, url_c hparams_url = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}' ckpt_path = hparams_url if url_ckpt else new_weights_path - class CurrentModel(EvalModelTemplate): + class CurrentModel(BoringModel): def __init__(self): super().__init__() @@ -563,6 +627,6 @@ def __init__(self): def test_model_pickle(tmpdir): - model = EvalModelTemplate() + model = BoringModel() pickle.dumps(model) cloudpickle.dumps(model) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 
98a02d730ec9e..e5895d98b6fcb 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -19,14 +19,14 @@ from torch.utils.data import DataLoader import tests.helpers.pipelines as tpipes +import tests.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate -from tests.helpers.datasets import TrialMNIST +from tests.helpers import BoringModel, RandomDataset from tests.helpers.utils import pl_multi_process_test if _TPU_AVAILABLE: @@ -34,7 +34,7 @@ import torch_xla.distributed.xla_multiprocessing as xmp SERIAL_EXEC = xmp.MpSerialExecutor() -_LARGER_DATASET = TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)) +_LARGER_DATASET = RandomDataset(32, 2000) # 8 cores needs a big dataset @@ -42,20 +42,30 @@ def _serial_train_loader(): return DataLoader(_LARGER_DATASET, batch_size=32) +class SerialLoaderBoringModel(BoringModel): + + def train_dataloader(self): + return DataLoader(RandomDataset(32, 2000), batch_size=32) + + def val_dataloader(self): + return DataLoader(RandomDataset(32, 2000), batch_size=32) + + @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=1, - limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -64,16 +74,17 @@ def test_model_tpu_cores_1(tmpdir): @pl_multi_process_test def test_model_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=[tpu_core], - limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}' @@ -82,6 +93,7 @@ def test_model_tpu_index(tmpdir, tpu_core): @pl_multi_process_test def test_model_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -91,29 +103,27 @@ def test_model_tpu_cores_8(tmpdir): limit_val_batches=0.4, ) - model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _serial_train_loader - model.val_dataloader = _serial_train_loader - - tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + model = SerialLoaderBoringModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05) @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_16bit_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, precision=16, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=1, - 
limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -123,17 +133,18 @@ def test_model_16bit_tpu_cores_1(tmpdir): @pl_multi_process_test def test_model_16bit_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, precision=16, progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=[tpu_core], - limit_train_batches=0.4, - limit_val_batches=0.2, + limit_train_batches=4, + limit_val_batches=2, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False) assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -143,6 +154,7 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core): @pl_multi_process_test def test_model_16bit_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, precision=16, @@ -153,26 +165,32 @@ def test_model_16bit_tpu_cores_8(tmpdir): limit_val_batches=0.4, ) - model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _serial_train_loader - model.val_dataloader = _serial_train_loader - - tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + model = SerialLoaderBoringModel() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False, min_acc=0.05) @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_tpu_early_stop(tmpdir): """Test if single TPU core training works""" - model = EvalModelTemplate() + + class CustomBoringModel(BoringModel): + + def validation_step(self, *args, **kwargs): + out = super().validation_step(*args, **kwargs) + self.log('val_loss', out['x']) + return out + + tutils.reset_seed() + model = CustomBoringModel() trainer = Trainer( - callbacks=[EarlyStopping()], + callbacks=[EarlyStopping(monitor='val_loss')], default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=50, - limit_train_batches=10, - limit_val_batches=10, + limit_train_batches=4, + limit_val_batches=4, tpu_cores=1, ) trainer.fit(model) @@ -182,6 +200,7 @@ def test_model_tpu_early_stop(tmpdir): @pl_multi_process_test def test_tpu_grad_norm(tmpdir): """Test if grad_norm works on TPU.""" + tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, @@ -192,7 +211,7 @@ def test_tpu_grad_norm(tmpdir): gradient_clip_val=0.1, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -201,7 +220,8 @@ def test_tpu_grad_norm(tmpdir): def test_dataloaders_passed_to_fit(tmpdir): """Test if dataloaders passed to trainer works on TPU""" - model = EvalModelTemplate() + tutils.reset_seed() + model = BoringModel() trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8) trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader()) From cf30b956a2c2cc0bfe5dfa8df732e4947a75754f Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 11 Feb 2021 20:02:11 +0530 Subject: [PATCH 07/34] update example (#5753) --- pytorch_lightning/core/hooks.py 
| 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 11a86c2251705..23fd5d9b58755 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -305,8 +305,7 @@ def on_after_backward(self) -> None: def on_after_backward(self): # example to inspect gradient information in tensorboard if self.trainer.global_step % 25 == 0: # don't make the tf file huge - params = self.state_dict() - for k, v in params.items(): + for k, v in self.named_parameters(): self.logger.experiment.add_histogram( tag=k, values=v.grad, global_step=self.trainer.global_step ) From 0c80b9f890b575499f7960d768da40dcccbf0f83 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 11 Feb 2021 15:32:12 +0100 Subject: [PATCH 08/34] fix metric docs (#5880) --- .../metrics/classification/auroc.py | 6 ++-- .../classification/average_precision.py | 6 ++-- .../classification/confusion_matrix.py | 12 ++++---- .../metrics/classification/f_beta.py | 14 +++++----- .../metrics/classification/iou.py | 8 +++--- .../classification/precision_recall_curve.py | 6 ++-- .../metrics/classification/roc.py | 6 ++-- pytorch_lightning/metrics/functional/auroc.py | 2 +- .../metrics/functional/average_precision.py | 2 ++ .../metrics/functional/confusion_matrix.py | 16 +++++------ .../metrics/functional/f_beta.py | 28 +++++++++---------- pytorch_lightning/metrics/functional/iou.py | 10 +++---- .../functional/precision_recall_curve.py | 2 ++ pytorch_lightning/metrics/functional/roc.py | 2 ++ 14 files changed, 63 insertions(+), 57 deletions(-) diff --git a/pytorch_lightning/metrics/classification/auroc.py b/pytorch_lightning/metrics/classification/auroc.py index a755e2bbb89cd..ece2452938b5b 100644 --- a/pytorch_lightning/metrics/classification/auroc.py +++ b/pytorch_lightning/metrics/classification/auroc.py @@ -29,10 +29,10 @@ class AUROC(Metric): Forward accepts - - ``preds`` (float tensor): ``(N, )`` (binary) or ``(N, C, ...)`` (multilabel/multiclass) - where C is the number of classes + - ``preds`` (float tensor): ``(N, ...)`` (binary) or ``(N, C, ...)`` (multiclass) tensor + with probabilities, where C is the number of classes. - - ``target`` (long tensor): ``(N, ...)`` or ``(N, C, ...)`` + - ``target`` (long tensor): ``(N, ...)`` or ``(N, C, ...)`` with integer labels For non-binary input, if the ``preds`` and ``target`` tensor have the same size the input will be interpreted as multilabel and if ``preds`` have one diff --git a/pytorch_lightning/metrics/classification/average_precision.py b/pytorch_lightning/metrics/classification/average_precision.py index f6678ddd4ae75..f9c7bde158383 100644 --- a/pytorch_lightning/metrics/classification/average_precision.py +++ b/pytorch_lightning/metrics/classification/average_precision.py @@ -28,10 +28,10 @@ class AveragePrecision(Metric): Forward accepts - - ``preds`` (float tensor): ``(N, ...)`` (binary) or ``(N, C, ...)`` (multiclass) - where C is the number of classes + - ``preds`` (float tensor): ``(N, ...)`` (binary) or ``(N, C, ...)`` (multiclass) tensor + with probabilities, where C is the number of classes. - - ``target`` (long tensor): ``(N, ...)`` + - ``target`` (long tensor): ``(N, ...)`` with integer labels Args: num_classes: integer with number of classes. 
Not necessary to provide diff --git a/pytorch_lightning/metrics/classification/confusion_matrix.py b/pytorch_lightning/metrics/classification/confusion_matrix.py index 77933ab9ba56f..c3defc82bc92d 100644 --- a/pytorch_lightning/metrics/classification/confusion_matrix.py +++ b/pytorch_lightning/metrics/classification/confusion_matrix.py @@ -23,7 +23,7 @@ class ConfusionMatrix(Metric): """ Computes the `confusion matrix `_. Works with binary, - multiclass, and multilabel data. Accepts logits from a model output or + multiclass, and multilabel data. Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target. @@ -35,8 +35,8 @@ class ConfusionMatrix(Metric): - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - ``target`` (long tensor): ``(N, ...)`` - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument + to convert into integer labels. This is the case for binary and multi-label probabilities. If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. @@ -44,13 +44,13 @@ class ConfusionMatrix(Metric): num_classes: Number of classes in the dataset. normalize: Normalization mode for confusion matrix. Choose from - - ``None``: no normalization (default) + - ``None`` or ``'none'``: no normalization (default) - ``'true'``: normalization over the targets (most commonly used) - ``'pred'``: normalization over the predictions - ``'all'``: normalization over the whole matrix threshold: - Threshold value for binary or multi-label logits. default: 0.5 + Threshold value for binary or multi-label probabilities. default: 0.5 compute_on_step: Forward only calls ``update()`` and return None if this is set to False. default: True dist_sync_on_step: @@ -90,7 +90,7 @@ def __init__( self.normalize = normalize self.threshold = threshold - allowed_normalize = ('true', 'pred', 'all', None) + allowed_normalize = ('true', 'pred', 'all', 'none', None) assert self.normalize in allowed_normalize, \ f"Argument normalize needs to be one of the following: {allowed_normalize}" diff --git a/pytorch_lightning/metrics/classification/f_beta.py b/pytorch_lightning/metrics/classification/f_beta.py index 6d5fd6e78ed27..9a580e02cf8ae 100755 --- a/pytorch_lightning/metrics/classification/f_beta.py +++ b/pytorch_lightning/metrics/classification/f_beta.py @@ -29,7 +29,7 @@ class FBeta(Metric): {(\beta^2 * \text{precision}) + \text{recall}} Where :math:`\beta` is some positive real factor. Works with binary, multiclass, and multilabel data. - Accepts logits from a model output or integer class values in prediction. + Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target. Forward accepts - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - ``target`` (long tensor): ``(N, ...)`` - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument + to convert into integer labels. 
This is the case for binary and multi-label probabilities. If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. @@ -46,14 +46,14 @@ class FBeta(Metric): num_classes: Number of classes in the dataset. beta: Beta coefficient in the F measure. threshold: - Threshold value for binary or multi-label logits. default: 0.5 + Threshold value for binary or multi-label probabilities. default: 0.5 average: - ``'micro'`` computes metric globally - ``'macro'`` computes metric for each class and uniformly averages them - ``'weighted'`` computes metric for each class and does a weighted-average, where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` computes and returns the metric per class + - ``'none'`` or ``None`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. compute_on_step: @@ -98,7 +98,7 @@ def __init__( self.average = average self.multilabel = multilabel - allowed_average = ("micro", "macro", "weighted", None) + allowed_average = ("micro", "macro", "weighted", "none", None) if self.average not in allowed_average: raise ValueError( 'Argument `average` expected to be one of the following:' @@ -163,7 +163,7 @@ class F1(FBeta): - ``'macro'`` computes metric for each class and uniformly averages them - ``'weighted'`` computes metric for each class and does a weighted-average, where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` computes and returns the metric per class + - ``'none'`` or ``None`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. compute_on_step: diff --git a/pytorch_lightning/metrics/classification/iou.py b/pytorch_lightning/metrics/classification/iou.py index 40567a40c807a..a261b767a8190 100644 --- a/pytorch_lightning/metrics/classification/iou.py +++ b/pytorch_lightning/metrics/classification/iou.py @@ -29,7 +29,7 @@ class IoU(ConfusionMatrix): They may be subject to conversion from input data (see description below). Note that it is different from box IoU. Works with binary, multiclass and multi-label data. - Accepts logits from a model output or integer class values in prediction. + Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target. Forward accepts @@ -37,8 +37,8 @@ class IoU(ConfusionMatrix): - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes - ``target`` (long tensor): ``(N, ...)`` - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument + to convert into integer labels. This is the case for binary and multi-label probabilities. If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. @@ -51,7 +51,7 @@ class IoU(ConfusionMatrix): `pred` AND no instances of the class index were present in `target`. For example, if we have 3 classes, [0, 0] for `pred`, and [0, 2] for `target`, then class 1 would be assigned the `absent_score`. threshold: - Threshold value for binary or multi-label logits. + Threshold value for binary or multi-label probabilities. reduction: a method to reduce metric score over labels. 
- ``'elementwise_mean'``: takes the mean (default) diff --git a/pytorch_lightning/metrics/classification/precision_recall_curve.py b/pytorch_lightning/metrics/classification/precision_recall_curve.py index 4f81c7283e202..9c6c4421cbb7c 100644 --- a/pytorch_lightning/metrics/classification/precision_recall_curve.py +++ b/pytorch_lightning/metrics/classification/precision_recall_curve.py @@ -31,10 +31,10 @@ class PrecisionRecallCurve(Metric): Forward accepts - - ``preds`` (float tensor): ``(N, ...)`` (binary) or ``(N, C, ...)`` (multiclass) - where C is the number of classes + - ``preds`` (float tensor): ``(N, ...)`` (binary) or ``(N, C, ...)`` (multiclass) tensor + with probabilities, where C is the number of classes. - - ``target`` (long tensor): ``(N, ...)`` + - ``target`` (long tensor): ``(N, ...)`` or ``(N, C, ...)`` with integer labels Args: num_classes: integer with number of classes. Not necessary to provide diff --git a/pytorch_lightning/metrics/classification/roc.py b/pytorch_lightning/metrics/classification/roc.py index a5ff459f67be1..9452d59fb9e76 100644 --- a/pytorch_lightning/metrics/classification/roc.py +++ b/pytorch_lightning/metrics/classification/roc.py @@ -28,10 +28,10 @@ class ROC(Metric): Forward accepts - - ``preds`` (float tensor): ``(N, ...)`` (binary) or ``(N, C, ...)`` (multiclass) - where C is the number of classes + - ``preds`` (float tensor): ``(N, ...)`` (binary) or ``(N, C, ...)`` (multiclass) tensor + with probabilities, where C is the number of classes. - - ``target`` (long tensor): ``(N, ...)`` + - ``target`` (long tensor): ``(N, ...)`` or ``(N, C, ...)`` with integer labels Args: num_classes: integer with number of classes. Not necessary to provide diff --git a/pytorch_lightning/metrics/functional/auroc.py b/pytorch_lightning/metrics/functional/auroc.py index 29f5081295434..816bbde7c6a79 100644 --- a/pytorch_lightning/metrics/functional/auroc.py +++ b/pytorch_lightning/metrics/functional/auroc.py @@ -147,7 +147,7 @@ def auroc( `_ Args: - preds: Predictions from model (probabilities) + preds: predictions from model (logits or probabilities) target: Ground truth labels num_classes: integer with number of classes. Not necessary to provide for binary problems. diff --git a/pytorch_lightning/metrics/functional/average_precision.py b/pytorch_lightning/metrics/functional/average_precision.py index 49dc6fed9cec6..19d496dd57dcc 100644 --- a/pytorch_lightning/metrics/functional/average_precision.py +++ b/pytorch_lightning/metrics/functional/average_precision.py @@ -61,6 +61,8 @@ def average_precision( Computes the average precision score. Args: + preds: predictions from model (logits or probabilities) + target: ground truth values num_classes: integer with number of classes. Not necessary to provide for binary problems. pos_label: integer determining the positive class. 
Default is ``None`` diff --git a/pytorch_lightning/metrics/functional/confusion_matrix.py b/pytorch_lightning/metrics/functional/confusion_matrix.py index a55619dd04891..58947f2cb19ed 100644 --- a/pytorch_lightning/metrics/functional/confusion_matrix.py +++ b/pytorch_lightning/metrics/functional/confusion_matrix.py @@ -33,11 +33,11 @@ def _confusion_matrix_update( def _confusion_matrix_compute(confmat: torch.Tensor, normalize: Optional[str] = None) -> torch.Tensor: - allowed_normalize = ('true', 'pred', 'all', None) + allowed_normalize = ('true', 'pred', 'all', 'none', None) assert normalize in allowed_normalize, \ f"Argument normalize needs to be one of the following: {allowed_normalize}" confmat = confmat.float() - if normalize is not None: + if normalize is not None and normalize != 'none': if normalize == 'true': cm = confmat / confmat.sum(axis=1, keepdim=True) elif normalize == 'pred': @@ -61,28 +61,28 @@ def confusion_matrix( ) -> torch.Tensor: """ Computes the confusion matrix. Works with binary, multiclass, and multilabel data. - Accepts logits from a model output or integer class values in prediction. + Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target. - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument + to convert into integer labels. This is the case for binary and multi-label probabilities. If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. Args: preds: (float or long tensor), Either a ``(N, ...)`` tensor with labels or - ``(N, C, ...)`` where C is the number of classes, tensor with logits/probabilities + ``(N, C, ...)`` where C is the number of classes, tensor with labels/probabilities target: ``target`` (long tensor), tensor with shape ``(N, ...)`` with ground truth labels num_classes: Number of classes in the dataset. normalize: Normalization mode for confusion matrix. Choose from - - ``None``: no normalization (default) + - ``None`` or ``'none'``: no normalization (default) - ``'true'``: normalization over the targets (most commonly used) - ``'pred'``: normalization over the predictions - ``'all'``: normalization over the whole matrix threshold: - Threshold value for binary or multi-label logits. default: 0.5 + Threshold value for binary or multi-label probabilities. default: 0.5 Example: 
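For readers skimming the patch, a small illustrative sketch of the functional call documented above (the tensors are made up; ``normalize='none'`` behaves exactly like ``None``):

    import torch
    from pytorch_lightning.metrics.functional import confusion_matrix

    # binary probabilities: entries above the default threshold of 0.5 become label 1
    preds = torch.tensor([0.9, 0.1, 0.8, 0.3])
    target = torch.tensor([1, 0, 1, 1])

    cm = confusion_matrix(preds, target, num_classes=2, normalize='none')
    # rows index targets, columns index predictions:
    # tensor([[1., 0.],
    #         [1., 2.]])

diff --git a/pytorch_lightning/metrics/functional/f_beta.py b/pytorch_lightning/metrics/functional/f_beta.py index 07633e8174db1..debb6c8285fc9 100755 --- a/pytorch_lightning/metrics/functional/f_beta.py +++ b/pytorch_lightning/metrics/functional/f_beta.py @@ -64,28 +64,28 @@ def fbeta( Computes f_beta metric. Works with binary, multiclass, and multilabel data. - Accepts logits from a model output or integer class values in prediction. + Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target. - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument + to convert into integer labels. This is the case for binary and multi-label probabilities. 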
If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. Args: - preds: estimated probabilities - target: ground-truth labels + preds: predictions from model (probabilities, or labels) + target: ground truth labels num_classes: Number of classes in the dataset. beta: Beta coefficient in the F measure. threshold: - Threshold value for binary or multi-label logits. default: 0.5 + Threshold value for binary or multi-label probabilities. default: 0.5 average: - ``'micro'`` computes metric globally - ``'macro'`` computes metric for each class and uniformly averages them - ``'weighted'`` computes metric for each class and does a weighted-average, where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` computes and returns the metric per class + - ``'none'`` or ``None`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. @@ -117,27 +117,27 @@ def f1( precision and recall scores. Works with binary, multiclass, and multilabel data. - Accepts logits from a model output or integer class values in prediction. + Accepts probabilities from a model output or integer class values in prediction. Works with multi-dimensional preds and target. - If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. - This is the case for binary and multi-label logits. + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument + to convert into integer labels. This is the case for binary and multi-label probabilities. If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. Args: - preds: estimated probabilities - target: ground-truth labels + preds: predictions from model (probabilities, or labels) + target: ground truth labels num_classes: Number of classes in the dataset. threshold: - Threshold value for binary or multi-label logits. default: 0.5 + Threshold value for binary or multi-label probabilities. default: 0.5 average: - ``'micro'`` computes metric globally - ``'macro'`` computes metric for each class and uniformly averages them - ``'weighted'`` computes metric for each class and does a weighted-average, where each class is weighted by their support (accounts for class imbalance) - - ``'none'`` computes and returns the metric per class + - ``'none'`` or ``None`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. diff --git a/pytorch_lightning/metrics/functional/iou.py b/pytorch_lightning/metrics/functional/iou.py index 1f539215ccd59..a716bcbdc4a11 100644 --- a/pytorch_lightning/metrics/functional/iou.py +++ b/pytorch_lightning/metrics/functional/iou.py @@ -63,15 +63,15 @@ def iou( Note that it is different from box IoU. - If pred and target are the same shape and pred is a float tensor, - we use the ``threshold`` argument. This is the case for binary and multi-label logits. + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument + to convert into integer labels. This is the case for binary and multi-label probabilities. If pred has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. Args: - pred: Tensor containing integer predictions, with shape [N, d1, d2, ...] - target: Tensor containing integer targets, with shape [N, d1, d2, ...] 
+ preds: tensor containing predictions from model (probabilities, or labels) with shape ``[N, d1, d2, ...]`` + target: tensor containing ground truth labels with shape ``[N, d1, d2, ...]`` ignore_index: optional int specifying a target class to ignore. If given, this class index does not contribute to the returned score, regardless of reduction method. Has no effect if given an int that is not in the range [0, num_classes-1], where num_classes is either given or derived from pred. By default, no index is ignored, and all classes are used. absent_score: score to use for an individual class, if no instances of the class index were present in `pred` AND no instances of the class index were present in `target`. For example, if we have 3 classes, [0, 0] for `pred`, and [0, 2] for `target`, then class 1 would be assigned the `absent_score`. threshold: - Threshold value for binary or multi-label logits. default: 0.5 + Threshold value for binary or multi-label probabilities. default: 0.5 num_classes: Optionally specify the number of classes reduction: a method to reduce metric score over labels. diff --git a/pytorch_lightning/metrics/functional/precision_recall_curve.py b/pytorch_lightning/metrics/functional/precision_recall_curve.py index 4eab13e6bbb88..57d8392dce549 100644 --- a/pytorch_lightning/metrics/functional/precision_recall_curve.py +++ b/pytorch_lightning/metrics/functional/precision_recall_curve.py @@ -160,6 +160,8 @@ def precision_recall_curve( Computes precision-recall pairs for different thresholds. Args: + preds: predictions from model (probabilities) + target: ground truth labels num_classes: integer with number of classes. Not necessary to provide for binary problems. pos_label: integer determining the positive class. Default is ``None`` diff --git a/pytorch_lightning/metrics/functional/roc.py b/pytorch_lightning/metrics/functional/roc.py index 16ecf18b91e11..5c4cb7c9de927 100644 --- a/pytorch_lightning/metrics/functional/roc.py +++ b/pytorch_lightning/metrics/functional/roc.py @@ -89,6 +89,8 @@ def roc( Computes the Receiver Operating Characteristic (ROC). Args: + preds: predictions from model (logits or probabilities) + target: ground truth values num_classes: integer with number of classes. Not necessary to provide for binary problems. pos_label: integer determining the positive class. 
Default is ``None`` From 44958ad964f2739032d0d59c3c5a1f66d3e74560 Mon Sep 17 00:00:00 2001 From: Teddy Koker Date: Thu, 11 Feb 2021 09:32:12 -0500 Subject: [PATCH 09/34] forward cache fix (#5895) --- pytorch_lightning/core/step_result.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 010b4429792e0..c227c039d2bca 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -297,7 +297,7 @@ def get_batch_log_metrics(self, include_forked_originals=True, add_dataloader_id dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) if options['logger'] and options['on_step']: - if isinstance(self[k], Metric): + if isinstance(self[k], Metric) and self[k]._forward_cache is not None: result[dl_key] = self[k]._forward_cache.detach() else: result[dl_key] = self[k] @@ -406,7 +406,7 @@ def get_batch_pbar_metrics(self, include_forked_originals=True, add_dataloader_i dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) if options['prog_bar'] and options['on_step']: - if isinstance(self[k], Metric): + if isinstance(self[k], Metric) and self[k]._forward_cache is not None: result[dl_key] = self[k]._forward_cache else: result[dl_key] = self[k] From 414aa5d3455d9a7416cb3917dbce4e9b669b663f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Feb 2021 15:32:14 +0100 Subject: [PATCH 10/34] Delete unused autopep8 config (#5904) --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 331e247839145..e8a3213f2b738 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,10 +4,6 @@ requires = [ "wheel", ] -[tool.autopep8] -max_line_length = 120 -ignore = ["W503", "W504", "E402", "E731", "C40", "E741", "F40", "F841"] - [tool.isort] known_first_party = [ "benchmarks", From 31da16344c2e842fe01f75037033d79369dbb3da Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Thu, 11 Feb 2021 15:32:32 +0100 Subject: [PATCH 11/34] add docs (#5902) --- docs/source/common/lightning_module.rst | 12 ++++++++ pytorch_lightning/core/lightning.py | 40 +++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index 30842deba8b04..b311507a860a7 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -770,6 +770,18 @@ validation_epoch_end .. automethod:: pytorch_lightning.core.lightning.LightningModule.validation_epoch_end :noindex: +write_prediction +~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.write_prediction + :noindex: + +write_prediction_dict +~~~~~~~~~~~~~~~~~~~~~ + +.. 
automethod:: pytorch_lightning.core.lightning.LightningModule.write_prediction_dict + :noindex: + ------------ Properties diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 900c7e1e208fb..e84be73e41acf 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -350,10 +350,46 @@ def log_dict( tbptt_reduce_fx=tbptt_reduce_fx, ) - def write_prediction(self, name, value, filename='predictions.pt'): + def write_prediction( + self, name: str, value: Union[torch.Tensor, List[torch.Tensor]], filename: str = 'predictions.pt' + ): + """ + Write predictions to disk using ``torch.save`` + + Example:: + + self.write_prediction('pred', torch.tensor(...), filename='my_predictions.pt') + + Args: + name: a string indicating the name to save the predictions under + value: the predictions, either a single :class:`~torch.Tensor` or a list of them + filename: name of the file to save the predictions to + + Note: + when running in distributed mode, calling ``write_prediction`` will create a file for + each device with respective names: ``filename_rank_0.pt``, ``filename_rank_1.pt``, ... + + """ self.trainer.evaluation_loop.predictions._add_prediction(name, value, filename) - def write_prediction_dict(self, predictions_dict, filename='predictions.pt'): + def write_prediction_dict(self, predictions_dict: Dict[str, Any], filename: str = 'predictions.pt'): + """ + Write a dictionary of predictions to disk at once using ``torch.save`` + + Example:: + + pred_dict = {'pred1': torch.tensor(...), 'pred2': torch.tensor(...)} + self.write_prediction_dict(pred_dict) + + Args: + predictions_dict: dict containing predictions, where each prediction should + either be a single :class:`~torch.Tensor` or a list of them + + Note: + when running in distributed mode, calling ``write_prediction_dict`` will create a file for + each device with respective names: ``filename_rank_0.pt``, ``filename_rank_1.pt``, ... + + """ for k, v in predictions_dict.items(): + self.write_prediction(k, v, filename)
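A rough usage sketch of the two methods documented above; the module body is hypothetical and only the ``test_step`` calls are the point:

    import torch
    from pytorch_lightning import LightningModule

    class LitModel(LightningModule):
        # forward/model definition omitted; illustration only

        def test_step(self, batch, batch_idx):
            preds = self(batch)
            # written with ``torch.save``; in distributed mode each rank writes its own file
            self.write_prediction('preds', preds)
            self.write_prediction_dict({'input': batch, 'preds': preds})

From 253e57c2c24ce4fbf5b100d1a84fea3a3ebe24a8 Mon Sep 17 00:00:00 2001 From: Teddy Koker Date: Thu, 11 Feb 2021 09:32:41 -0500 Subject: [PATCH 12/34] Feature: LightningDataModule.from_datasets(...) 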
(#5133) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add class method * add tests * docstring * pep * Add type annotations Co-authored-by: Nicki Skafte * pep * fix import * remove num_workers inference * Update pytorch_lightning/core/datamodule.py Co-authored-by: Carlos Mocholí * Update pytorch_lightning/core/datamodule.py Co-authored-by: Nicki Skafte * Update pytorch_lightning/core/datamodule.py Co-authored-by: Nicki Skafte * fix syntax * typing fix * list -> sequence * list -> sequence * missing import * fix test Co-authored-by: Nicki Skafte Co-authored-by: Carlos Mocholí --- pytorch_lightning/core/datamodule.py | 70 +++++++++++++++++++++++--- tests/core/test_datamodules.py | 61 ++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/core/datamodule.py b/pytorch_lightning/core/datamodule.py index f46c945a0de76..ecf5a99e703c9 100644 --- a/pytorch_lightning/core/datamodule.py +++ b/pytorch_lightning/core/datamodule.py @@ -15,12 +15,13 @@ import functools import inspect +import os from abc import abstractmethod from argparse import ArgumentParser, Namespace -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union, Dict, Sequence, Mapping import torch -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Dataset from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks from pytorch_lightning.utilities import parsing, rank_zero_only @@ -35,9 +36,9 @@ def __init__(self, *args, **kwargs): def __call__(cls, *args, **kwargs): """A wrapper for LightningDataModule that: - 1. Runs user defined subclass's __init__ - 2. Assures prepare_data() runs on rank 0 - 3. Lets you check prepare_data and setup to see if they've been called + 1. Runs user defined subclass's __init__ + 2. Assures prepare_data() runs on rank 0 + 3. Lets you check prepare_data and setup to see if they've been called """ if not cls.__has_added_checks: cls.__has_added_checks = True @@ -266,8 +267,7 @@ def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any: @classmethod def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: - r"""Extends existing argparse by default `LightningDataModule` attributes. - """ + r"""Extends existing argparse by default `LightningDataModule` attributes.""" parser = ArgumentParser(parents=[parent_parser], add_help=False) added_args = [x.dest for x in parser._actions] @@ -364,3 +364,59 @@ def get_init_arguments_and_types(cls) -> List[Tuple[str, Tuple, Any]]: name_type_default.append((arg, arg_types, arg_default)) return name_type_default + + @classmethod + def from_datasets( + cls, + train_dataset: Optional[Union[Dataset, Sequence[Dataset], Mapping[str, Dataset]]] = None, + val_dataset: Optional[Union[Dataset, Sequence[Dataset]]] = None, + test_dataset: Optional[Union[Dataset, Sequence[Dataset]]] = None, + batch_size: int = 1, + num_workers: int = 0, + ): + r""" + Create an instance from torch.utils.data.Dataset. + + Args: + train_dataset: (optional) Dataset to be used for train_dataloader() + val_dataset: (optional) Dataset or list of Dataset to be used for val_dataloader() + test_dataset: (optional) Dataset or list of Dataset to be used for test_dataloader() + batch_size: Batch size to use for each dataloader. Default is 1. + num_workers: Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process. 
+ + """ + def dataloader(ds, shuffle=False): + return DataLoader( + ds, + batch_size=batch_size, + shuffle=shuffle, + num_workers=num_workers, + pin_memory=True, + ) + + def train_dataloader(): + if isinstance(train_dataset, Mapping): + return {key: dataloader(ds, shuffle=True) for key, ds in train_dataset.items()} + if isinstance(train_dataset, Sequence): + return [dataloader(ds, shuffle=True) for ds in train_dataset] + return dataloader(train_dataset, shuffle=True) + + def val_dataloader(): + if isinstance(val_dataset, Sequence): + return [dataloader(ds) for ds in val_dataset] + return dataloader(val_dataset) + + def test_dataloader(): + if isinstance(test_dataset, Sequence): + return [dataloader(ds) for ds in test_dataset] + return dataloader(test_dataset) + + datamodule = cls() + if train_dataset is not None: + datamodule.train_dataloader = train_dataloader + if val_dataset is not None: + datamodule.val_dataloader = val_dataloader + if test_dataset is not None: + datamodule.test_dataloader = test_dataloader + return datamodule diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 76fdca0fedd48..a5c7c1cab3ee7 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -13,7 +13,7 @@ # limitations under the License. import pickle from argparse import ArgumentParser -from typing import Any, Dict +from typing import Any, Dict, Optional from unittest.mock import MagicMock import pytest @@ -419,7 +419,6 @@ def test_step_end(self, outputs): def test_dm_transfer_batch_to_device(tmpdir): class CustomBatch: - def __init__(self, data): self.samples = data[0] self.targets = data[1] @@ -452,6 +451,28 @@ def transfer_batch_to_device(self, data, device): assert batch_gpu.samples.device == batch_gpu.targets.device == expected +class CustomMNISTDataModule(LightningDataModule): + def __init__(self, data_dir: str = "./"): + super().__init__() + self.data_dir = data_dir + self._epochs_called_for = [] + + def prepare_data(self): + TrialMNIST(self.data_dir, train=True, download=True) + + def setup(self, stage: Optional[str] = None): + + mnist_full = TrialMNIST(root=self.data_dir, train=True, num_samples=64, download=True) + self.mnist_train, self.mnist_val = random_split(mnist_full, [128, 64]) + self.dims = self.mnist_train[0][0].shape + + def train_dataloader(self): + assert self.trainer.current_epoch not in self._epochs_called_for + self._epochs_called_for.append(self.trainer.current_epoch) + + return DataLoader(self.mnist_train, batch_size=4) + + def test_dm_reload_dataloaders_every_epoch(tmpdir): """Test datamodule, where trainer argument reload_dataloaders_every_epoch is set to True/False""" @@ -483,5 +504,37 @@ def train_dataloader(self): limit_train_batches=0.01, reload_dataloaders_every_epoch=True, ) - results = trainer.fit(model, dm) - assert results + trainer.fit(model, dm) + + +class DummyDS(torch.utils.data.Dataset): + def __getitem__(self, index): + return 1 + + def __len__(self): + return 100 + + +def test_dm_init_from_datasets(tmpdir): + + train_ds = DummyDS() + valid_ds = DummyDS() + test_ds = DummyDS() + + valid_dss = [DummyDS(), DummyDS()] + test_dss = [DummyDS(), DummyDS()] + + dm = LightningDataModule.from_datasets(train_ds, batch_size=4, num_workers=0) + assert torch.all(next(iter(dm.train_dataloader())) == torch.ones(4)) + assert dm.val_dataloader() is None + assert dm.test_dataloader() is None + + dm = LightningDataModule.from_datasets(train_ds, valid_ds, test_ds, batch_size=4, num_workers=0) + assert 
torch.all(next(iter(dm.val_dataloader())) == torch.ones(4)) + assert torch.all(next(iter(dm.test_dataloader())) == torch.ones(4)) + + dm = LightningDataModule.from_datasets(train_ds, valid_dss, test_dss, batch_size=4, num_workers=0) + assert torch.all(next(iter(dm.val_dataloader()[0])) == torch.ones(4)) + assert torch.all(next(iter(dm.val_dataloader()[1])) == torch.ones(4)) + assert torch.all(next(iter(dm.test_dataloader()[0])) == torch.ones(4)) + assert torch.all(next(iter(dm.test_dataloader()[1])) == torch.ones(4))
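Outside the test suite, the new class method can be used along these lines; a sketch assuming any map-style ``torch.utils.data.Dataset`` (the ``RandomSamples`` class below is illustrative):

    import torch
    from torch.utils.data import Dataset
    from pytorch_lightning import LightningDataModule

    class RandomSamples(Dataset):
        def __getitem__(self, index):
            return torch.rand(32)

        def __len__(self):
            return 256

    # a single dataset yields one loader; a sequence of datasets yields a list of loaders
    dm = LightningDataModule.from_datasets(
        train_dataset=RandomSamples(),
        val_dataset=[RandomSamples(), RandomSamples()],
        batch_size=32,
        num_workers=2,
    )

From 9f12ca095ab6e3295bd03fd1e50130a12b11569c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Feb 2021 15:32:45 +0100 Subject: [PATCH 13/34] More EpochResultStore refactors! :tada: (#5522) Co-authored-by: chaton Co-authored-by: Jirka Borovec --- .../logger_connector/epoch_result_store.py | 136 +++++++----------- pytorch_lightning/trainer/trainer.py | 2 +- .../dynamic_args/test_multiple_optimizers.py | 121 ---------------- .../trainer/logging_/test_logger_connector.py | 18 +-- .../optimization/test_multiple_optimizers.py | 96 ++++++++++++- 5 files changed, 147 insertions(+), 226 deletions(-) delete mode 100644 tests/trainer/dynamic_args/test_multiple_optimizers.py diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 469a2777128f9..84f88fb9840f2 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from collections import defaultdict -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple import torch @@ -128,50 +128,28 @@ def get_epoch_log_metrics(self, *_, **__) -> List[Dict]: def get_forked_metrics(self, *_, **__) -> List[Dict]: return self.get_epoch_from_func_name("get_forked_metrics") - @staticmethod - def _append_to_structure(primary_dict, opt_idx, batch_idx, result) -> None: - primary_dict.setdefault(opt_idx, {}) - primary_dict[opt_idx].setdefault(batch_idx, []) - primary_dict[opt_idx][batch_idx].append(result) + def append(self, result: Result, info: Dict) -> None: + dataloader_idx = info["dataloader_idx"] + self._internal_type = info["type"] + opt_idx = info["opt_idx"] - def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optional[dict] = None) -> None: - if not isinstance(result, Result): - raise TypeError(f'{result} must be Result') - - if dataloader_idx is None: - dataloader_idx = 0 - - if extra_info is None: - extra_info = {} - - # [dataloader_idx][optimizer_idx][training_step_idx] is a list - if len(extra_info) > 0: - self._internal_type = ResultStoreType.INSIDE_BATCH_TRAIN_LOOP - # initialize dictionary + if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: if dataloader_idx not in self._internals: - self._internals[dataloader_idx] = {} self._internals_reduced[dataloader_idx] = defaultdict(dict) self._latest_ref[dataloader_idx] = {} + self._internals.setdefault(dataloader_idx, {}) - # extract infos - opt_idx = extra_info["opt_idx"] - batch_idx = extra_info["batch_idx"] - - self._append_to_structure(self._internals[dataloader_idx], opt_idx, batch_idx, result) - - self._latest_ref[dataloader_idx][opt_idx] = result - - # [dataloader_idx] is a list + batch_idx = info["batch_idx"] + 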
self._internals[dataloader_idx].setdefault(opt_idx, {}) + self._internals[dataloader_idx][opt_idx].setdefault(batch_idx, []) + self._internals[dataloader_idx][opt_idx][batch_idx].append(result) else: - self._internal_type = ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP self._internals.setdefault(dataloader_idx, []) self._internals[dataloader_idx].append(result) + self._latest_ref.setdefault(dataloader_idx, {}) - if dataloader_idx not in self._latest_ref: - self._latest_ref[dataloader_idx] = {} - self._latest_ref[dataloader_idx][0] = {} - - self._latest_ref[dataloader_idx][0] = result + self._latest_ref[dataloader_idx].setdefault(opt_idx, {}) + self._latest_ref[dataloader_idx][opt_idx] = result def auto_reduce_results_on_epoch_end(self) -> None: """ @@ -188,36 +166,32 @@ def auto_reduce_results_on_epoch_end(self) -> None: for opt_idx in list(epoch_metrics): # TODO: Figure out to reduce memory # TODO: How to start training in middle of epoch - opt_outputs = epoch_metrics[opt_idx] - + outputs = epoch_metrics[opt_idx] # reduce across time first time_reduced_outputs = [] - for batch_idx in opt_outputs.keys(): - tbptt_outs = opt_outputs[batch_idx] - tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) - if len(tbptt_outs) > 1: - time_reduced_outputs.append(tbptt_outs) + for tbptt_outputs in outputs.values(): + tbptt_outputs = type(tbptt_outputs[0]).reduce_across_time(tbptt_outputs) + if len(tbptt_outputs) > 1: + time_reduced_outputs.append(tbptt_outputs) if len(time_reduced_outputs) == 0: continue # reduce across training steps - opt_outputs = time_reduced_outputs[0].__class__.reduce_on_epoch_end(time_reduced_outputs) + outputs = type(time_reduced_outputs[0]).reduce_on_epoch_end(time_reduced_outputs) # with manual opt need 1 + metrics because meta is always there - if opt_outputs.minimize is not None: - opt_outputs.minimize = opt_outputs.minimize.mean() + if outputs.minimize is not None: + outputs.minimize = outputs.minimize.mean() - self._internals_reduced[dl_idx][opt_idx] = opt_outputs + self._internals_reduced[dl_idx][opt_idx] = outputs # free memory del self._internals[dl_idx][opt_idx] else: - # no need to reduce as called only once - if len(epoch_metrics) == 1: - reduced_epoch_metrics = epoch_metrics[0] - else: - reduced_epoch_metrics = epoch_metrics[0].__class__.reduce_on_epoch_end(epoch_metrics) + reduced_epoch_metrics = epoch_metrics[0] + if len(epoch_metrics) != 1: + reduced_epoch_metrics = type(reduced_epoch_metrics).reduce_on_epoch_end(epoch_metrics) self._internals_reduced[dl_idx] = reduced_epoch_metrics @@ -257,18 +231,22 @@ def __getitem__(self, key: str) -> Any: return self._internals.get(key, None) @property - def has_split_and_opt_idx(self): - """ - This function informs if we are running within training batch loop - """ - return self._split_idx is not None and self._opt_idx is not None - - @property - def extra_info(self): + def info(self): """ This function provides necessary parameters to properly configure HookResultStore obj """ - return {"batch_idx": self.trainer.batch_idx, "split_idx": self._split_idx, "opt_idx": self._opt_idx} + model_ref = self.trainer.get_model() + return { + "batch_idx": self.trainer.batch_idx, + "fx_name": model_ref._current_hook_fx_name or model_ref._current_fx_name, + "dataloader_idx": model_ref._current_dataloader_idx or 0, + "opt_idx": self._opt_idx or 0, + "split_idx": self._split_idx or 0, + "type": ( + ResultStoreType.INSIDE_BATCH_TRAIN_LOOP if self._opt_idx is not None and self._split_idx is not None + else 
ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP + ) + } def reset_model(self): """ @@ -279,17 +257,6 @@ def reset_model(self): model_ref._current_hook_fx_name = None model_ref._current_fx_name = '' - def current_model_info(self): - """ - This function is used to extract - information related to current function scoping `self.log` call. - """ - model_ref = self.trainer.get_model() - # extract hook information - fx_name = model_ref._current_hook_fx_name or model_ref._current_fx_name - dataloader_idx = model_ref._current_dataloader_idx - return fx_name, dataloader_idx - def cache_result(self) -> None: """ This function is called after every hook @@ -306,13 +273,11 @@ def cache_result(self) -> None: model_ref._current_fx_name = '' return - # extract model information - fx_name, dataloader_idx = self.current_model_info() + info = self.info + fx_name = info["fx_name"] self._internals.setdefault(fx_name, HookResultStore(fx_name)) - extra_info = self.extra_info if self.has_split_and_opt_idx else {} - # attach capture batch_size Result.attach_batch_size(self._batch_size, hook_result) @@ -322,16 +287,15 @@ def cache_result(self) -> None: elif self.trainer._distrib_type == DistributedType.DP: hook_result.to(torch.device("cuda", self.trainer.root_gpu)) - self._internals[fx_name].append(hook_result, dataloader_idx=dataloader_idx, extra_info=extra_info) + self._internals[fx_name].append(hook_result, info) # update logged_metrics, progress_bar_metrics, callback_metrics - if "epoch_end" in fx_name: self.update_logger_connector() self.reset_model() - def update_logger_connector(self) -> None: + def update_logger_connector(self) -> Tuple[Dict, Dict]: """ This function is called every time we capture a hook It automatically updates the logger_connector followings: @@ -483,24 +447,24 @@ def __call__( Example:: - result: Result = self(fx_name="training_step", dl_idx="0", opt_idx="0", reduced=True) + result: Result = self(fx_name="training_step", dl_idx=0, opt_idx=0, reduced=True) result['train_loss_epoch'] # aggregated train_loss over one epoch. Args: - fx_name: Hook name from ModelHooks or Callback. Example: `training_step` + fx_name: Hook name from ModelHooks or Callback. Example: ``"training_step"`` - dl_idx: Dataloader idx in short. It starts from 0 to num_dataloaders - 1 + dl_idx: Dataloader index in short. From ``0`` to ``num_dataloaders - 1`` - opt_idx: Optimizer idx in short. It starts from 0 to num_optimizers - 1 + opt_idx: Optimizer index in short. From ``0`` to ``num_optimizers - 1`` - batch_idx: Index of batch idx seen during batch training or evaluation. - Works only with reduced=False + batch_idx: Batch index seen during batch training or evaluation. + Works only with ``reduced=False`` split_idx: Index of split idx in training loop when tbptt is used. reduced: Data are being aggregated on on_epoch_end. - Indicates if we want to access aggregated Result or not. + Indicates if we want to access the aggregated Result or not. 
""" hook_result = self[fx_name] internal_type = hook_result._internal_type diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 60e1456560a37..184f5c41b878b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1031,7 +1031,7 @@ def call_hook(self, hook_name, *args, **kwargs): hook_fx = getattr(model_ref, hook_name) output = hook_fx(*args, **kwargs) - # if the PL module doesn't have the hook then call the accelator + # if the PL module doesn't have the hook then call the accelerator # used to auto-reduce things for the user with Results obj elif hasattr(self.accelerator_backend, hook_name): accelerator_hook = getattr(self.accelerator_backend, hook_name) diff --git a/tests/trainer/dynamic_args/test_multiple_optimizers.py b/tests/trainer/dynamic_args/test_multiple_optimizers.py deleted file mode 100644 index 3b35ac3aa67eb..0000000000000 --- a/tests/trainer/dynamic_args/test_multiple_optimizers.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import torch - -from pytorch_lightning import Trainer -from tests.helpers.boring_model import BoringModel - - -def test_multiple_optimizers(tmpdir): - """ - Tests that only training_step can be used - """ - - class TestModel(BoringModel): - - def on_train_epoch_start(self) -> None: - self.opt_0_seen = False - self.opt_1_seen = False - - def training_step(self, batch, batch_idx, optimizer_idx): - if optimizer_idx == 0: - self.opt_0_seen = True - elif optimizer_idx == 1: - self.opt_1_seen = True - else: - raise Exception('should only have two optimizers') - - self.training_step_called = True - loss = self.step(batch[0]) - return loss - - def training_epoch_end(self, outputs) -> None: - # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 - - model = TestModel() - model.val_dataloader = None - - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=2, - limit_val_batches=2, - max_epochs=1, - log_every_n_steps=1, - weights_summary=None, - ) - - trainer.fit(model) - assert model.opt_0_seen - assert model.opt_1_seen - - -def test_multiple_optimizers_manual(tmpdir): - """ - Tests that only training_step can be used - """ - - class TestModel(BoringModel): - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def on_train_epoch_start(self) -> None: - self.opt_0_seen = False - self.opt_1_seen = False - - def training_step(self, batch, batch_idx, optimizer_idx): - # manual - (opt_a, opt_b) = self.optimizers() - loss_1 = self.step(batch[0]) - - # fake generator - self.manual_backward(loss_1, opt_a) - opt_a.step() - opt_a.zero_grad() - - # fake discriminator - loss_2 = self.step(batch[0]) - self.manual_backward(loss_2, opt_b) - opt_b.step() - 
opt_b.zero_grad() - - def training_epoch_end(self, outputs) -> None: - # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 - - model = TestModel() - model.val_dataloader = None - - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=2, - limit_val_batches=2, - max_epochs=1, - log_every_n_steps=1, - weights_summary=None, - ) - - trainer.fit(model) diff --git a/tests/trainer/logging_/test_logger_connector.py b/tests/trainer/logging_/test_logger_connector.py index f9b0459ecc3c0..92eb2c76a8c6b 100644 --- a/tests/trainer/logging_/test_logger_connector.py +++ b/tests/trainer/logging_/test_logger_connector.py @@ -208,22 +208,18 @@ def training_step_end(self, *_): @pytest.mark.parametrize('num_dataloaders', [1, 2]) def test__logger_connector__epoch_result_store__test_multi_dataloaders(tmpdir, monkeypatch, num_dataloaders): """ - Tests that LoggerConnector will properly capture logged information in multi_dataloaders scenario + Tests that LoggerConnector will properly capture logged information in multi dataloaders scenario """ monkeypatch.setenv("PL_DEV_DEBUG", "1") class TestModel(BoringModel): - - test_losses = {} + test_losses = {dl_idx: [] for dl_idx in range(num_dataloaders)} @decorator_with_arguments(fx_name="test_step") def test_step(self, batch, batch_idx, dl_idx=0): output = self.layer(batch) loss = self.loss(batch, output) - - self.test_losses.setdefault(dl_idx, []) self.test_losses[dl_idx].append(loss) - self.log("test_loss", loss, on_step=True, on_epoch=True) return {"test_loss": loss} @@ -236,12 +232,10 @@ def on_test_epoch_end(self): self.reduce_results = deepcopy(self.trainer.logger_connector.cached_results) def test_dataloader(self): - return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] + return [super().test_dataloader()] * num_dataloaders model = TestModel() - model.val_dataloader = None model.test_epoch_end = None - limit_test_batches = 4 trainer = Trainer( @@ -261,15 +255,15 @@ def test_dataloader(self): assert len(generated) == num_dataloaders for dl_idx in range(num_dataloaders): - generated = len(test_results(fx_name="test_step", dl_idx=dl_idx)) - assert generated == limit_test_batches + generated = test_results(fx_name="test_step", dl_idx=dl_idx) + assert len(generated) == limit_test_batches test_results = model.reduce_results for dl_idx in range(num_dataloaders): expected = torch.stack(model.test_losses[dl_idx]).mean() generated = test_results(fx_name="test_step", dl_idx=dl_idx, reduced=True)["test_loss_epoch"] - assert abs(expected.item() - generated.item()) < 1e-6 + torch.testing.assert_allclose(generated, expected) def test_call_back_validator(tmpdir): diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py index 5df5cdc01fdc4..84fdeab2c1311 100644 --- a/tests/trainer/optimization/test_multiple_optimizers.py +++ b/tests/trainer/optimization/test_multiple_optimizers.py @@ -20,13 +20,22 @@ from tests.helpers.boring_model import BoringModel +class MultiOptModel(BoringModel): + + def configure_optimizers(self): + opt_a = torch.optim.SGD(self.layer.parameters(), lr=0.001) + opt_b = torch.optim.SGD(self.layer.parameters(), lr=0.001) + return opt_a, opt_b + + def test_unbalanced_logging_with_multiple_optimizers(tmpdir): """ This 
tests ensures reduction works in unbalanced logging settings, even when a Callback also logs. """ - class TestModel(BoringModel): + class TestModel(MultiOptModel): + actual = {0: [], 1: []} def training_step(self, batch, batch_idx, optimizer_idx): @@ -36,11 +45,6 @@ def training_step(self, batch, batch_idx, optimizer_idx): self.actual[optimizer_idx].append(loss) return out - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001) - optimizer2 = torch.optim.SGD(self.layer.parameters(), lr=0.001) - return [optimizer, optimizer2] - model = TestModel() model.training_epoch_end = None @@ -70,3 +74,83 @@ def on_train_batch_end(self, trainer, pl_module, output, batch, batch_idx, dl_id torch.testing.assert_allclose(trainer.callback_metrics[f"loss_{k}_epoch"], torch.tensor(v).mean()) assert trainer.callback_metrics["test_train_batch_end"] == len(model.optimizers()) - 1 + + +def test_multiple_optimizers(tmpdir): + + class TestModel(MultiOptModel): + + seen = [False, False] + + def training_step(self, batch, batch_idx, optimizer_idx): + self.seen[optimizer_idx] = True + return super().training_step(batch, batch_idx) + + def training_epoch_end(self, outputs) -> None: + # outputs should be an array with an entry per optimizer + assert len(outputs) == 2 + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return optimizer, optimizer_2 + + model = TestModel() + model.val_dataloader = None + + trainer = pl.Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + assert all(model.seen) + + +def test_multiple_optimizers_manual(tmpdir): + + class TestModel(MultiOptModel): + + def __init__(self): + super().__init__() + self.automatic_optimization = False + + def training_step(self, batch, batch_idx, optimizer_idx): + self.training_step_called = True + + # manual optimization + opt_a, opt_b = self.optimizers() + loss_1 = self.step(batch[0]) + + # fake generator + self.manual_backward(loss_1, opt_a) + opt_a.step() + opt_a.zero_grad() + + # fake discriminator + loss_2 = self.step(batch[0]) + self.manual_backward(loss_2, opt_b) + opt_b.step() + opt_b.zero_grad() + + def training_epoch_end(self, outputs) -> None: + # outputs should be an array with an entry per optimizer + assert len(outputs) == 2 + + model = TestModel() + model.val_dataloader = None + + trainer = pl.Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + assert model.training_step_called From e676ff96b16224331297dbd0e5ecd5cf364965b8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 11 Feb 2021 15:33:10 +0100 Subject: [PATCH 14/34] Typing: callback base (#5919) * typing for callback base --- pytorch_lightning/callbacks/base.py | 101 +++++++++++++++------------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index 3bcbb11dbcf0a..d9ffff1bd47e6 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -17,6 +17,9 @@ """ import abc +from typing import Any + +from pytorch_lightning.core.lightning import LightningModule class Callback(abc.ABC): @@ -26,158 +29,166 @@ class Callback(abc.ABC): Subclass this class and override any of the relevant hooks """ - def 
on_before_accelerator_backend_setup(self, trainer, pl_module): + def on_before_accelerator_backend_setup(self, trainer, pl_module: LightningModule) -> None: """Called before accelerator is being setup""" pass - def setup(self, trainer, pl_module, stage: str): + def setup(self, trainer, pl_module: LightningModule, stage: str) -> None: """Called when fit or test begins""" pass - def teardown(self, trainer, pl_module, stage: str): + def teardown(self, trainer, pl_module: LightningModule, stage: str) -> None: """Called when fit or test ends""" pass - def on_init_start(self, trainer): + def on_init_start(self, trainer) -> None: """Called when the trainer initialization begins, model has not yet been set.""" pass - def on_init_end(self, trainer): + def on_init_end(self, trainer) -> None: """Called when the trainer initialization ends, model has not yet been set.""" pass - def on_fit_start(self, trainer, pl_module): + def on_fit_start(self, trainer, pl_module: LightningModule) -> None: """Called when fit begins""" pass - def on_fit_end(self, trainer, pl_module): + def on_fit_end(self, trainer, pl_module: LightningModule) -> None: """Called when fit ends""" pass - def on_sanity_check_start(self, trainer, pl_module): + def on_sanity_check_start(self, trainer, pl_module: LightningModule) -> None: """Called when the validation sanity check starts.""" pass - def on_sanity_check_end(self, trainer, pl_module): + def on_sanity_check_end(self, trainer, pl_module: LightningModule) -> None: """Called when the validation sanity check ends.""" pass - def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): + def on_train_batch_start( + self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: """Called when the train batch begins.""" pass - def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + def on_train_batch_end( + self, trainer, pl_module: LightningModule, outputs: Any, batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: """Called when the train batch ends.""" pass - def on_train_epoch_start(self, trainer, pl_module): + def on_train_epoch_start(self, trainer, pl_module: LightningModule) -> None: """Called when the train epoch begins.""" pass - def on_train_epoch_end(self, trainer, pl_module, outputs): + def on_train_epoch_end(self, trainer, pl_module: LightningModule, outputs: Any) -> None: """Called when the train epoch ends.""" pass - def on_validation_epoch_start(self, trainer, pl_module): + def on_validation_epoch_start(self, trainer, pl_module: LightningModule) -> None: """Called when the val epoch begins.""" pass - def on_validation_epoch_end(self, trainer, pl_module): + def on_validation_epoch_end(self, trainer, pl_module: LightningModule) -> None: """Called when the val epoch ends.""" pass - def on_test_epoch_start(self, trainer, pl_module): + def on_test_epoch_start(self, trainer, pl_module: LightningModule) -> None: """Called when the test epoch begins.""" pass - def on_test_epoch_end(self, trainer, pl_module): + def on_test_epoch_end(self, trainer, pl_module: LightningModule) -> None: """Called when the test epoch ends.""" pass - def on_epoch_start(self, trainer, pl_module): + def on_epoch_start(self, trainer, pl_module: LightningModule) -> None: """Called when the epoch begins.""" pass - def on_epoch_end(self, trainer, pl_module): + def on_epoch_end(self, trainer, pl_module: LightningModule) -> None: """Called when the epoch ends.""" pass - def on_batch_start(self, 
trainer, pl_module): + def on_batch_start(self, trainer, pl_module: LightningModule) -> None: """Called when the training batch begins.""" pass - def on_validation_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): + def on_validation_batch_start( + self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: """Called when the validation batch begins.""" pass - def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + def on_validation_batch_end( + self, trainer, pl_module: LightningModule, outputs: Any, batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: """Called when the validation batch ends.""" pass - def on_test_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): + def on_test_batch_start( + self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: """Called when the test batch begins.""" pass - def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + def on_test_batch_end( + self, trainer, pl_module: LightningModule, outputs: Any, batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: """Called when the test batch ends.""" pass - def on_batch_end(self, trainer, pl_module): + def on_batch_end(self, trainer, pl_module: LightningModule) -> None: """Called when the training batch ends.""" pass - def on_train_start(self, trainer, pl_module): + def on_train_start(self, trainer, pl_module: LightningModule) -> None: """Called when the train begins.""" pass - def on_train_end(self, trainer, pl_module): + def on_train_end(self, trainer, pl_module: LightningModule) -> None: """Called when the train ends.""" pass - def on_pretrain_routine_start(self, trainer, pl_module): + def on_pretrain_routine_start(self, trainer, pl_module: LightningModule) -> None: """Called when the pretrain routine begins.""" pass - def on_pretrain_routine_end(self, trainer, pl_module): + def on_pretrain_routine_end(self, trainer, pl_module: LightningModule) -> None: """Called when the pretrain routine ends.""" pass - def on_validation_start(self, trainer, pl_module): + def on_validation_start(self, trainer, pl_module: LightningModule) -> None: """Called when the validation loop begins.""" pass - def on_validation_end(self, trainer, pl_module): + def on_validation_end(self, trainer, pl_module: LightningModule) -> None: """Called when the validation loop ends.""" pass - def on_test_start(self, trainer, pl_module): + def on_test_start(self, trainer, pl_module: LightningModule) -> None: """Called when the test begins.""" pass - def on_test_end(self, trainer, pl_module): + def on_test_end(self, trainer, pl_module: LightningModule) -> None: """Called when the test ends.""" pass - def on_keyboard_interrupt(self, trainer, pl_module): - """Called when the training is interrupted by KeyboardInterrupt.""" + def on_keyboard_interrupt(self, trainer, pl_module: LightningModule) -> None: + """Called when the training is interrupted by ``KeyboardInterrupt``.""" pass - def on_save_checkpoint(self, trainer, pl_module): + def on_save_checkpoint(self, trainer, pl_module: LightningModule) -> None: """Called when saving a model checkpoint, use to persist state.""" pass - def on_load_checkpoint(self, checkpointed_state): + def on_load_checkpoint(self, checkpointed_state) -> None: """Called when loading a model checkpoint, use to reload state.""" pass - def on_after_backward(self, trainer, pl_module): - """ - Called 
after loss.backward() and before optimizers do anything. - """ + def on_after_backward(self, trainer, pl_module: LightningModule) -> None: + """Called after ``loss.backward()`` and before optimizers do anything.""" pass - def on_before_zero_grad(self, trainer, pl_module, optimizer): - """ - Called after optimizer.step() and before optimizer.zero_grad(). - """ + def on_before_zero_grad(self, trainer, pl_module: LightningModule, optimizer) -> None: + """Called after ``optimizer.step()`` and before ``optimizer.zero_grad()``.""" pass From 4857546c259042fbc9b2ddc155cd9e6b5bf5d3ff Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 11 Feb 2021 23:02:46 +0530 Subject: [PATCH 15/34] Fix: Failing test in data_modules(dp) (#5924) * Update test_datamodules.py * fix code format issue * fix test restore * fix code format issue --- pytorch_lightning/core/datamodule.py | 10 +++++----- tests/core/test_datamodules.py | 30 +++++----------------------- tests/helpers/simple_models.py | 5 +++-- tests/models/test_restore.py | 6 +++--- 4 files changed, 16 insertions(+), 35 deletions(-) mode change 100644 => 100755 tests/helpers/simple_models.py mode change 100644 => 100755 tests/models/test_restore.py diff --git a/pytorch_lightning/core/datamodule.py b/pytorch_lightning/core/datamodule.py index ecf5a99e703c9..d0e1725b2c4ac 100644 --- a/pytorch_lightning/core/datamodule.py +++ b/pytorch_lightning/core/datamodule.py @@ -15,10 +15,9 @@ import functools import inspect -import os from abc import abstractmethod from argparse import ArgumentParser, Namespace -from typing import Any, List, Optional, Tuple, Union, Dict, Sequence, Mapping +from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union import torch from torch.utils.data import DataLoader, Dataset @@ -382,10 +381,11 @@ def from_datasets( val_dataset: (optional) Dataset or list of Dataset to be used for val_dataloader() test_dataset: (optional) Dataset or list of Dataset to be used for test_dataloader() batch_size: Batch size to use for each dataloader. Default is 1. - num_workers: Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process. - number of CPUs available. + num_workers: Number of subprocesses to use for data loading. 0 means that the + data will be loaded in the main process. Number of CPUs available. """ + def dataloader(ds, shuffle=False): return DataLoader( ds, @@ -399,7 +399,7 @@ def train_dataloader(): if isinstance(train_dataset, Mapping): return {key: dataloader(ds, shuffle=True) for key, ds in train_dataset.items()} if isinstance(train_dataset, Sequence): - return [dataloader(ds, shuffle=True) for ds in train_dataset] + return [dataloader(ds, shuffle=True) for ds in train_dataset] return dataloader(train_dataset, shuffle=True) def val_dataloader(): diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index a5c7c1cab3ee7..a83a6a41c9287 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -13,7 +13,7 @@ # limitations under the License. 
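# Illustrative sketch -- not part of the patch -- of how the ``from_datasets``
# helper touched in the datamodule diff above is meant to be used. The dataset
# shapes, batch size, and variable names here are invented for the example.
import torch
from torch.utils.data import TensorDataset

from pytorch_lightning import LightningDataModule

train_ds = TensorDataset(torch.randn(64, 32))
val_ds = TensorDataset(torch.randn(16, 32))

# ``train_dataset`` may also be a Sequence (one loader per dataset) or a
# Mapping of name -> Dataset, matching the branches in ``train_dataloader()``.
dm = LightningDataModule.from_datasets(
    train_dataset=train_ds,
    val_dataset=val_ds,
    batch_size=4,
    num_workers=0,  # 0 loads batches in the main process, per the docstring
)
loader = dm.train_dataloader()  # the train split is built with shuffle=True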
import pickle from argparse import ArgumentParser -from typing import Any, Dict, Optional +from typing import Any, Dict from unittest.mock import MagicMock import pytest @@ -381,8 +381,8 @@ def _step(self, batch, batch_idx): def training_step(self, batch, batch_idx): _, y = batch out = self._step(batch, batch_idx) - out['loss'] = F.cross_entropy(out['logits'], y) - return out + loss = F.cross_entropy(out['logits'], y) + return loss def validation_step(self, batch, batch_idx): return self._step(batch, batch_idx) @@ -419,6 +419,7 @@ def test_step_end(self, outputs): def test_dm_transfer_batch_to_device(tmpdir): class CustomBatch: + def __init__(self, data): self.samples = data[0] self.targets = data[1] @@ -451,28 +452,6 @@ def transfer_batch_to_device(self, data, device): assert batch_gpu.samples.device == batch_gpu.targets.device == expected -class CustomMNISTDataModule(LightningDataModule): - def __init__(self, data_dir: str = "./"): - super().__init__() - self.data_dir = data_dir - self._epochs_called_for = [] - - def prepare_data(self): - TrialMNIST(self.data_dir, train=True, download=True) - - def setup(self, stage: Optional[str] = None): - - mnist_full = TrialMNIST(root=self.data_dir, train=True, num_samples=64, download=True) - self.mnist_train, self.mnist_val = random_split(mnist_full, [128, 64]) - self.dims = self.mnist_train[0][0].shape - - def train_dataloader(self): - assert self.trainer.current_epoch not in self._epochs_called_for - self._epochs_called_for.append(self.trainer.current_epoch) - - return DataLoader(self.mnist_train, batch_size=4) - - def test_dm_reload_dataloaders_every_epoch(tmpdir): """Test datamodule, where trainer argument reload_dataloaders_every_epoch is set to True/False""" @@ -508,6 +487,7 @@ def train_dataloader(self): class DummyDS(torch.utils.data.Dataset): + def __getitem__(self, index): return 1 diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py old mode 100644 new mode 100755 index 9288a3c802276..c33c470d043b7 --- a/tests/helpers/simple_models.py +++ b/tests/helpers/simple_models.py @@ -21,7 +21,8 @@ class ClassificationModel(LightningModule): - def __init__(self): + def __init__(self, lr=0.01): + self.lr = lr super().__init__() for i in range(3): setattr(self, f"layer_{i}", nn.Linear(32, 32)) @@ -44,7 +45,7 @@ def forward(self, x): return logits def configure_optimizers(self): - optimizer = torch.optim.Adam(self.parameters(), lr=0.01) + optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) return [optimizer], [] def training_step(self, batch, batch_idx): diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py old mode 100644 new mode 100755 index 114ebf33681dc..28e3e65a87586 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -208,8 +208,8 @@ def _step(self, batch, batch_idx): def training_step(self, batch, batch_idx): _, y = batch out = self._step(batch, batch_idx) - out['loss'] = F.cross_entropy(out['logits'], y) - return out + loss = F.cross_entropy(out['logits'], y) + return loss def validation_step(self, batch, batch_idx): return self._step(batch, batch_idx) @@ -221,7 +221,7 @@ def validation_step_end(self, outputs): self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y'])) dm = ClassifDataModule() - model = CustomClassificationModelDP() + model = CustomClassificationModelDP(lr=0.1) # exp file to get meta logger = tutils.get_default_logger(tmpdir) From 4bdf2fe55f45c4cc8b397d4b45041265c402519f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: 
Fri, 12 Feb 2021 00:06:40 +0100 Subject: [PATCH 16/34] remove executable bit on source files (#5929) * 644 --- .circleci/config.yml | 0 docs/source/_static/images/logo.svg | 0 docs/source/advanced/lr_finder.rst | 0 pytorch_lightning/callbacks/lr_monitor.py | 0 pytorch_lightning/metrics/classification/f_beta.py | 0 pytorch_lightning/metrics/functional/f_beta.py | 0 tests/helpers/simple_models.py | 0 tests/models/test_restore.py | 0 tests/trainer/test_config_validator.py | 0 tests/trainer/test_lr_finder.py | 0 tests/trainer/test_trainer_tricks.py | 0 11 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 .circleci/config.yml mode change 100755 => 100644 docs/source/_static/images/logo.svg mode change 100755 => 100644 docs/source/advanced/lr_finder.rst mode change 100755 => 100644 pytorch_lightning/callbacks/lr_monitor.py mode change 100755 => 100644 pytorch_lightning/metrics/classification/f_beta.py mode change 100755 => 100644 pytorch_lightning/metrics/functional/f_beta.py mode change 100755 => 100644 tests/helpers/simple_models.py mode change 100755 => 100644 tests/models/test_restore.py mode change 100755 => 100644 tests/trainer/test_config_validator.py mode change 100755 => 100644 tests/trainer/test_lr_finder.py mode change 100755 => 100644 tests/trainer/test_trainer_tricks.py diff --git a/.circleci/config.yml b/.circleci/config.yml old mode 100755 new mode 100644 diff --git a/docs/source/_static/images/logo.svg b/docs/source/_static/images/logo.svg old mode 100755 new mode 100644 diff --git a/docs/source/advanced/lr_finder.rst b/docs/source/advanced/lr_finder.rst old mode 100755 new mode 100644 diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py old mode 100755 new mode 100644 diff --git a/pytorch_lightning/metrics/classification/f_beta.py b/pytorch_lightning/metrics/classification/f_beta.py old mode 100755 new mode 100644 diff --git a/pytorch_lightning/metrics/functional/f_beta.py b/pytorch_lightning/metrics/functional/f_beta.py old mode 100755 new mode 100644 diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py old mode 100755 new mode 100644 diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py old mode 100755 new mode 100644 diff --git a/tests/trainer/test_config_validator.py b/tests/trainer/test_config_validator.py old mode 100755 new mode 100644 diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py old mode 100755 new mode 100644 diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py old mode 100755 new mode 100644 From 979c879e4515dde5db2a46dfc840517e723e0a62 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 12 Feb 2021 17:42:32 +0100 Subject: [PATCH 17/34] drop DDP CLI test (#5938) * fix tests * = Co-authored-by: Jirka Borovec --- azure-pipelines.yml | 2 +- tests/accelerators/legacy/test_ddp.py | 24 ------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 17029d281713b..0bb4f5cfffd82 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -82,7 +82,7 @@ jobs: - bash: | python -m coverage report python -m coverage xml - codecov --token $(CODECOV_TOKEN) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure + codecov --token=$(CODECOV_TOKEN) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure displayName: 'Statistics' - script: | diff --git a/tests/accelerators/legacy/test_ddp.py 
b/tests/accelerators/legacy/test_ddp.py index 0e7d6948c1834..48cef6d595946 100644 --- a/tests/accelerators/legacy/test_ddp.py +++ b/tests/accelerators/legacy/test_ddp.py @@ -72,30 +72,6 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args): assert out['test_acc'] > 0.90 -# START: test_cli ddp test -@pytest.mark.skipif(os.getenv("PL_IN_LAUNCHER", '0') == '1', reason="test runs only in DDPLauncher") -def internal_test_cli(tmpdir, args=None): - """ - This test verify we can call function using test_cli name - """ - - return 1 - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_cli(tmpdir): - DDPLauncher.run_from_cmd_line("--max_epochs 1 --gpus 2 --accelerator ddp", internal_test_cli, tmpdir) - # load the results of the script - result_path = os.path.join(tmpdir, 'ddp.result') - result = torch.load(result_path) - # verify the file wrote the expected outputs - assert result['status'] == 'complete' - assert str(result['result']) == '1' - - -# END: test_cli ddp test - - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @DDPLauncher.run( "--max_epochs [max_epochs] --gpus 2 --accelerator [accelerator]", From ae19c9723ba2007ea28dd57c4df65eeea41be3ae Mon Sep 17 00:00:00 2001 From: Eric Cousineau Date: Fri, 12 Feb 2021 12:25:08 -0500 Subject: [PATCH 18/34] tests: Remove usage of --flake8 flag (#5909) * tests: Remove usage of --flake8 flag * Remove commented line Co-authored-by: Carlos Mocholi --- .github/CONTRIBUTING.md | 2 +- Makefile | 2 +- requirements/test.txt | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 98321b20959b8..4eed846b28e05 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -202,7 +202,7 @@ This is useful if you do not test against all required dependency versions. **Docker:** Another option is utilize the [pytorch lightning cuda base docker image](https://hub.docker.com/repository/docker/pytorchlightning/pytorch_lightning/tags?page=1&name=cuda). 
You can then run: ```bash -python -m pytest pytorch_lightning tests pl_examples -v --flake8 +python -m pytest pytorch_lightning tests pl_examples -v ``` ### Pull Request diff --git a/Makefile b/Makefile index a659d4a4b0229..964b9ab10361b 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ test: clean # install APEX, see https://github.com/NVIDIA/apex#linux # use this to run tests - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v --flake8 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v python -m coverage report docs: clean diff --git a/requirements/test.txt b/requirements/test.txt index 1ca085ef2adb5..2d47143ca58d4 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -2,7 +2,6 @@ coverage>=5.0 codecov>=2.1 pytest>=5.0 # pytest-cov -# pytest-flake8 flake8>=3.6 check-manifest twine==3.2 From 309ce7a96664316b0f42147d6a849001d841559a Mon Sep 17 00:00:00 2001 From: Dusan Drevicky <55678224+ddrevicky@users.noreply.github.com> Date: Fri, 12 Feb 2021 21:01:22 +0100 Subject: [PATCH 19/34] Fix: passing wrong strings for scheduler interval doesn't throw an error (#5923) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Raise if scheduler interval not 'step' or 'epoch' * Add test for unknown 'interval' value in scheduler * Use BoringModel instead of EvalModelTemplate Co-authored-by: Jirka Borovec * Fix import order * Apply yapf in test_datamodules * Add missing imports to test_datamodules * Fix too long comment * Update pytorch_lightning/trainer/optimizers.py * Fix unused imports and exception message * Fix failing test Co-authored-by: Jirka Borovec Co-authored-by: Carlos Mocholí --- pytorch_lightning/trainer/optimizers.py | 6 ++++++ tests/trainer/optimization/test_optimizers.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 6772dcc645e3b..6793a370fdc35 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -117,6 +117,12 @@ def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): raise MisconfigurationException( 'The lr scheduler dict must have the key "scheduler" with its item being an lr scheduler' ) + if 'interval' in scheduler and scheduler['interval'] not in ('step', 'epoch'): + raise MisconfigurationException( + f'The "interval" key in lr scheduler dict must be "step" or "epoch"' + f' but is "{scheduler["interval"]}"' + ) + scheduler['reduce_on_plateau'] = isinstance( scheduler['scheduler'], optim.lr_scheduler.ReduceLROnPlateau ) diff --git a/tests/trainer/optimization/test_optimizers.py b/tests/trainer/optimization/test_optimizers.py index c9a9250995dd0..7172b2dca76da 100644 --- a/tests/trainer/optimization/test_optimizers.py +++ b/tests/trainer/optimization/test_optimizers.py @@ -459,6 +459,24 @@ def test_unknown_configure_optimizers_raises(tmpdir): trainer.fit(model) +def test_lr_scheduler_with_unknown_interval_raises(tmpdir): + """ + Test exception when lr_scheduler dict has unknown interval param value + """ + model = BoringModel() + optimizer = torch.optim.Adam(model.parameters()) + model.configure_optimizers = lambda: { + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': torch.optim.lr_scheduler.StepLR(optimizer, 1), + 'interval': "incorrect_unknown_value" + }, + } + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + with 
pytest.raises(MisconfigurationException, match=r'The "interval" key in lr scheduler dict must be'): + trainer.fit(model) + + def test_lr_scheduler_with_extra_keys_warns(tmpdir): """ Test warning when lr_scheduler dict has extra keys From da6dbc8d1d128cf783d7151b012a5502bbd52bf5 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Fri, 12 Feb 2021 21:48:56 +0100 Subject: [PATCH 20/34] PoC: Accelerator refactor (#5743) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * restoring the result from subprocess * fix queue.get() order for results * add missing "block_backward_sync" context manager * add missing "block_backward_sync" context manager * fix sync_batchnorm * fix supported gpu-ids for tuple * fix clip gradients and inf recursion * accelerator selection: added cluster_environment plugin * fix torchelastic test * fix reduce early stopping decision for DDP * fix tests: callbacks, conversion to lightning optimizer * fix lightning optimizer does not pickle * fix setting benchmark and deterministic option * fix slurm amp test * fix prepare_data test and determine node_rank * fix retrieving last path when testing * remove obsolete plugin argument * fix test: test_trainer_config * fix torchscript tests * fix trainer.model access * move properties * fix test_transfer_batch_hook * fix auto_select_gpus * fix omegaconf test * fix test that needs to simulate slurm ddp * add horovod plugin * fix test with named arguments * clean up whitespace * fix datamodules test * remove old accelerators * fix naming * move old plugins * move to plugins * create precision subpackage * create training_type subpackage * fix all new import errors * fix wrong arguments order passed to test * fix LR finder * Added sharded training type and amp plugin * Move clip grad to precision plugin * Added sharded spawn, select accelerators based on distributed_backend + enable custom fp16 plugin automatically * Fix import issue, attempting to fix tests * Fix initial test * Reflect hook logic from master, should wrap model after move to device * Optional state consolidation, since master has optimizers not wrapped * change attribute for instance test * reset optimizers optimizers are not used in main process, so state would be wrong. 
* legacy * imports in accel * legacy2 * trainer imports * fix import errors after rebase * move hook to new setup location * provide unwrapping logic * fix trainer callback system * added ddp2 implementation * fix imports .legacy * move plugins * restore legacy * drop test.py from root * add tpu accelerator and plugins * fixes * fix lightning optimizer merge * reset bugreportmodel * unwrapping * step routing forward * model access * unwrap * opt * integrate distrib_type * sync changes * sync * fixes * add forgotten generators * add missing logic * update * import * missed imports * import fixes * isort * mv f * changelog * format * move helper to parallel plugin * d * add world size * clean up * duplicate * activate ddp_sharded and tpu * set nvidia flags * remove unused colab var * use_tpu <-> on_tpu attrs * make some ddp_cpu and clusterplugin tests pass * Ref/accelerator connector (#5742) * final cleanup Co-authored-by: Adrian Wälchli * connector cleanup Co-authored-by: Adrian Wälchli * trainer cleanup Co-authored-by: Adrian Wälchli * accelerator cleanup + missing logic in accelerator connector Co-authored-by: Adrian Wälchli * add missing changes to callbacks Co-authored-by: Adrian Wälchli * reflect accelerator changes to lightning module Co-authored-by: Adrian Wälchli * clean cluster envs Co-authored-by: Adrian Wälchli * cleanup plugins Co-authored-by: Adrian Wälchli * add broadcasting Co-authored-by: Adrian Wälchli * yapf * remove plugin connector Co-authored-by: Adrian Wälchli * plugins * manual optimization * update optimizer routing * add rank to torchelastic * fix memory mixed precision * setstate on trainer for pickling in ddp spawn * add predict method * add back commented accelerator code * adapt test for sync_batch_norm to new plugin * fix deprecated tests * fix ddp cpu choice when no num_processes are given * yapf format * skip a memory test that cannot pass anymore * fix pickle error in spawn plugin * x * avoid * x * fix cyclic import in docs build * add support for sharded * update typing * add sharded and sharded_spawn to distributed types * make unwrap model default * refactor LightningShardedDataParallel similar to LightningDistributedDataParallel * update sharded spawn to reflect changes * update sharded to reflect changes * Merge 1.1.5 changes * fix merge * fix merge * yapf isort * fix merge * yapf isort * fix indentation in test * copy over reinit scheduler implementation from dev1.2 * fix apex tracking calls with dev_debugger * reduce diff to dev1.2, clean up * fix trainer config test when gpus>0 and num_processes >0 and ddp_cpu * sort plugin tests legacy/new * fix error handling for amp on cpu * fix merge fix merge fix merge * [Feat] Resolve manual_backward (#5837) * resolve manual_backward * resolve flake8 * update * resolve for ddp_spawn * resolve flake8 * resolve flake8 * resolve flake8 Co-authored-by: Ubuntu * fix tests/accelerator tests on cpu * [BugFix] Resolve manual optimization (#5852) * resolve manual_optimization * update * update Co-authored-by: Ubuntu * Remove copy trainer parameters to happen earlier within the loop and add safe guard to get ref model (#5856) * resovle a bug * Accelerator refactor sharded rpc (#5854) * rpc branch * merge * update handling of rpc * make devices etc. Optional in RPC * set devices etc. 
later if necessary * remove devices from sequential * make devices optional in rpc * fix import * uncomment everything * fix cluster selection Co-authored-by: Ubuntu * resolve bug * fix assert in rpc test * resolve a test * fix docs compilation * accelerator refactor - fix for sharded parity test (#5866) * fix memory issue with ddp_spawn * x x x x x x x x x * x * Remove DDP2 as this does not apply * Add missing pre optimizer hook to ensure lambda closure is called * fix apex docstring * [accelerator][BugFix] Resolve some test for 1 gpu (#5863) * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * update * update * revert init * resolve a bug * update * resolve flake8 * update * update * update * revert init * update * resolve flake8 * update * update * update * update * update * all_gather * update * make plugins work, add misconfig for RPC * update * update * remove breaking test * resolve some tests * resolve flake8 * revert to ddp_spawn Co-authored-by: root Co-authored-by: Ubuntu Co-authored-by: Justus Schock * yapf isort * resolve flake8 * fix apex doctests * fix apex doctests 2 * resolve docs * update drone * clean env * update * update * update * update * merge * Fix RPC related tests, clean out old API, update for new accelerator API [skip ci] (#5881) * Fix RPC related tests, clean out old API, update for new accelerator API * Move tests out of legacy folder, update paths and names * Update test_remove_1-4.py * Expose properties for tpu cores/gpus/num_gpus * Add root GPU property * Move properties to properties.py * move tests that were previously in drone * Fix root GPU property (#5908) * Move root GPU to property, remove horovod set as this is handled in horovod plugin, ensure we mock correctly to set GPU accelerator * Add missing tests back * fix best model path transfer when no checkpoint callback available * Fix setup hook order [wip] (#5858) * Call trainer setup hook before accelerator setup * Add test case * add new test * typo * fix callback order in test Co-authored-by: tchaton Co-authored-by: Adrian Wälchli * rename ddp sequential -> rpc sequential for special test * revert * fix stupid merge problem * Use property in connector for sampler (#5913) * merge the import conflicts * fix spawning of processes in slurm * [wip] Fix some bugs for TPU [skip ci] (#5878) * fixed for single tpu * fixed spawn * fixed spawn * update * update * wip * resolve bugs * resolve bug * update on comment * removed decorator * resolve comments * set to 4 * update * update * need cleaning * update * update * update * resolve flake8 * resolve bugs * exclude broadcast * resolve bugs * change test * update * update * skip if meet fails * properly raise trace * update * add catch * wrap test * resolve typo * update * typo Co-authored-by: Lezwon Castelino Co-authored-by: Your Name * resolve some tests * update * fix imports * update * resolve flake8 * update azure pipeline * skip a sharded test on cpu that requires a gpu * resolve tpus * resolve bug * resolve flake8 * update * updat utils * revert permission change on files * suggestions from carlos Co-authored-by: Carlos Mocholí * remove unrelated formatting changes * remove incomplete comment * Update pytorch_lightning/accelerators/__init__.py Co-authored-by: Carlos Mocholí * remove unrelated formatting change * add types * warn 1.7 ddp manual backward only if ddp kwarg unset * yapf + isort * pep8 unused imports * fix cyclic 
import in docs * Apply suggestions from code review * typer in accelerator.py * typo * Apply suggestions from code review * formatting * update on comments * update typo * Update pytorch_lightning/trainer/properties.py Co-authored-by: Adrian Wälchli * update * suggestion from code review * suggestion from code review Co-authored-by: Adrian Wälchli Co-authored-by: SeanNaren Co-authored-by: Jirka Borovec Co-authored-by: chaton Co-authored-by: Ubuntu Co-authored-by: Sean Naren Co-authored-by: root Co-authored-by: Lezwon Castelino Co-authored-by: Your Name Co-authored-by: Carlos Mocholí Co-authored-by: Jirka Borovec Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- .gitignore | 4 +- benchmarks/test_sharded_parity.py | 33 +-- dockers/tpu-tests/tpu_test_cases.jsonnet | 2 +- docs/source/advanced/amp.rst | 6 +- docs/source/common/trainer.rst | 4 +- pytorch_lightning/accelerators/__init__.py | 17 +- pytorch_lightning/accelerators/accelerator.py | 110 ++++---- .../accelerators/accelerator_connector.py | 244 +++++++++++++----- pytorch_lightning/accelerators/gpu.py | 16 +- .../accelerators/legacy/tpu_accelerator.py | 25 -- pytorch_lightning/accelerators/tpu.py | 24 ++ pytorch_lightning/callbacks/early_stopping.py | 3 +- .../callbacks/model_checkpoint.py | 11 +- pytorch_lightning/core/lightning.py | 6 +- pytorch_lightning/core/memory.py | 4 +- pytorch_lightning/core/optimizer.py | 31 +-- pytorch_lightning/core/step_result.py | 3 + pytorch_lightning/loggers/wandb.py | 2 +- pytorch_lightning/overrides/base.py | 8 +- pytorch_lightning/overrides/fairscale.py | 30 +-- pytorch_lightning/plugins/__init__.py | 11 + pytorch_lightning/plugins/base_plugin.py | 17 +- .../environments/cluster_environment.py | 7 +- .../plugins/environments/slurm_environment.py | 7 +- .../environments/torchelastic_environment.py | 3 + .../plugins/precision/apex_amp.py | 40 ++- .../plugins/precision/native_amp.py | 48 ++-- .../plugins/precision/precision_plugin.py | 20 +- .../plugins/precision/tpu_bfloat.py | 2 +- .../plugins/training_type/__init__.py | 2 + .../plugins/training_type/ddp.py | 33 ++- .../plugins/training_type/ddp_spawn.py | 60 ++++- pytorch_lightning/plugins/training_type/dp.py | 14 + .../plugins/training_type/horovod.py | 6 +- .../plugins/training_type/parallel.py | 28 +- .../plugins/training_type/rpc.py | 64 +---- .../plugins/training_type/rpc_sequential.py | 125 +++++---- .../plugins/training_type/sharded.py | 15 +- .../plugins/training_type/sharded_spawn.py | 19 +- .../plugins/training_type/single_tpu.py | 36 ++- .../plugins/training_type/tpu_spawn.py | 101 ++++++-- .../training_type/training_type_plugin.py | 55 ++-- pytorch_lightning/trainer/callback_hook.py | 14 +- .../connectors/checkpoint_connector.py | 8 +- .../logger_connector/logger_connector.py | 7 +- .../logger_connector/metrics_holder.py | 3 +- .../trainer/connectors/model_connector.py | 6 +- .../trainer/connectors/slurm_connector.py | 98 ------- pytorch_lightning/trainer/data_loading.py | 10 +- pytorch_lightning/trainer/deprecated_api.py | 36 +-- pytorch_lightning/trainer/optimizers.py | 20 -- pytorch_lightning/trainer/properties.py | 177 +++++++++++-- pytorch_lightning/trainer/trainer.py | 179 +++++++------ pytorch_lightning/trainer/training_loop.py | 67 ++--- pytorch_lightning/utilities/__init__.py | 1 + pytorch_lightning/utilities/device_parser.py | 8 +- pytorch_lightning/utilities/enums.py | 3 + pytorch_lightning/utilities/imports.py | 2 +- .../legacy/test_accelerator_connector.py | 225 +++++++--------- 
tests/accelerators/legacy/test_ddp_spawn.py | 1 - .../legacy/test_multi_nodes_gpu.py | 3 +- tests/accelerators/legacy/test_tpu_backend.py | 6 +- tests/callbacks/test_callbacks.py | 4 +- tests/callbacks/test_finetuning_callback.py | 29 +++ tests/checkpointing/test_model_checkpoint.py | 1 - tests/conftest.py | 13 +- tests/core/test_datamodules.py | 40 +-- tests/core/test_lightning_module.py | 8 +- tests/core/test_lightning_optimizer.py | 3 +- tests/core/test_memory.py | 7 +- tests/deprecated_api/test_remove_1-4.py | 33 ++- tests/helpers/pipelines.py | 12 +- tests/helpers/utils.py | 13 +- tests/models/test_amp.py | 26 +- tests/models/test_gpu.py | 9 +- tests/models/test_hooks.py | 10 +- tests/models/test_horovod.py | 8 +- tests/models/test_sync_batchnorm.py | 18 +- tests/models/test_tpu.py | 51 ++-- tests/plugins/legacy/__init__.py | 1 - tests/plugins/legacy/test_ddp_plugin.py | 235 ----------------- tests/plugins/legacy/test_plugin.py | 130 ---------- .../plugins/legacy/test_plugin_properties.py | 29 --- tests/plugins/{legacy => }/test_amp_plugin.py | 42 +-- .../plugins/{legacy => }/test_apex_plugin.py | 10 +- tests/plugins/{legacy => }/test_rpc_plugin.py | 43 +-- ...lugin.py => test_rpc_sequential_plugin.py} | 53 ++-- .../{legacy => }/test_sharded_plugin.py | 119 +++------ tests/special_tests.sh | 11 +- .../optimization/test_manual_optimization.py | 175 +++++++------ tests/trainer/test_dataloaders.py | 2 +- tests/trainer/test_trainer.py | 58 ++++- 92 files changed, 1685 insertions(+), 1678 deletions(-) delete mode 100644 tests/plugins/legacy/__init__.py delete mode 100644 tests/plugins/legacy/test_ddp_plugin.py delete mode 100644 tests/plugins/legacy/test_plugin.py delete mode 100644 tests/plugins/legacy/test_plugin_properties.py rename tests/plugins/{legacy => }/test_amp_plugin.py (80%) rename tests/plugins/{legacy => }/test_apex_plugin.py (87%) rename tests/plugins/{legacy => }/test_rpc_plugin.py (58%) rename tests/plugins/{legacy/test_ddp_sequential_plugin.py => test_rpc_sequential_plugin.py} (82%) rename tests/plugins/{legacy => }/test_sharded_plugin.py (71%) diff --git a/.gitignore b/.gitignore index e25ca447d763d..9fcf0e1e296df 100644 --- a/.gitignore +++ b/.gitignore @@ -151,6 +151,6 @@ wandb # dataset generated from bolts in examples. 
cifar-10-batches-py - +*.pt # ctags -tags +tags \ No newline at end of file diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 92a5c79088018..f0476ffb7e155 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -15,14 +15,13 @@ import os import platform import time -from typing import Type, Union +from typing import Type import pytest import torch from pytorch_lightning import seed_everything, Trainer -from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.legacy.sharded_plugin import DDPShardedPlugin +from pytorch_lightning.plugins import DDPSpawnShardedPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from tests.accelerators.legacy import DDPLauncher from tests.helpers.boring_model import BoringModel, RandomDataset @@ -34,8 +33,6 @@ def test_ddp_sharded_plugin_correctness_one_gpu(): plugin_parity_test( gpus=1, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -48,8 +45,6 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): plugin_parity_test( gpus=1, precision=16, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -61,8 +56,6 @@ def test_ddp_sharded_plugin_correctness_amp_one_gpu(): def test_ddp_sharded_plugin_correctness_multi_gpu(): plugin_parity_test( gpus=2, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -76,8 +69,6 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu(): plugin_parity_test( gpus=2, precision=16, - accelerator='ddp_spawn', - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -91,8 +82,6 @@ def test_ddp_string_sharded_plugin_correctness_amp_multi_gpu(): plugin_parity_test( gpus=2, precision=16, - accelerator='ddp_spawn', - plugin='ddp_sharded', model_cls=SeedTrainLoaderModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -108,8 +97,6 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_ddp(tmpdir, args=None): plugin_parity_test( gpus=args.gpus, precision=args.precision, - accelerator=args.accelerator, - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -124,8 +111,6 @@ def test_ddp_sharded_plugin_correctness_amp_multi_gpu_ddp(tmpdir, args=None): plugin_parity_test( gpus=args.gpus, precision=args.precision, - accelerator=args.accelerator, - plugin=DDPShardedPlugin(), model_cls=SeedTrainLoaderModel, ) @@ -139,9 +124,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim(): Ensures same results using multiple optimizers across multiple GPUs """ plugin_parity_test( - plugin=DDPShardedPlugin(), gpus=2, - accelerator='ddp_spawn', model_cls=SeedTrainLoaderMultipleOptimizersModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -156,9 +139,7 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): Ensures using multiple optimizers across multiple GPUs with manual optimization """ plugin_parity_test( - plugin=DDPShardedPlugin(), gpus=2, - accelerator='ddp_spawn', model_cls=SeedTrainLoaderManualModel, max_percent_speed_diff=0.25, # todo: Increase speed diff since only 2 GPUs sharding 2 optimizers ) @@ -260,9 +241,7 @@ def 
record_ddp_fit_model_stats(trainer, model, use_cuda): def plugin_parity_test( model_cls: Type[SeedTrainLoaderModel], - plugin: Union[str, DDPPlugin], seed: int = 42, - accelerator: str = 'ddp_spawn', gpus: int = 0, precision: int = 32, max_percent_speed_diff: float = 0.1, @@ -273,9 +252,7 @@ def plugin_parity_test( Args: model_cls: Model class to use for test. - plugin: Plugin to parity test. seed: Seed for generators. Note that this does not handle the seed for data-loading on multi-process. - accelerator: Accelerator type for test. gpus: Number of GPUS to enable. precision: Whether to use AMP or normal FP32 training. max_percent_speed_diff: The maximum speed difference compared to normal DDP training. @@ -293,7 +270,7 @@ def plugin_parity_test( max_epochs=1, gpus=gpus, precision=precision, - accelerator=accelerator, + accelerator='ddp_spawn', ) max_memory_ddp, ddp_time = record_ddp_fit_model_stats(trainer=trainer, model=ddp_model, use_cuda=use_cuda) @@ -307,9 +284,9 @@ def plugin_parity_test( max_epochs=1, gpus=gpus, precision=precision, - accelerator=accelerator, - plugins=[plugin], + accelerator='ddp_sharded_spawn', ) + assert isinstance(trainer.training_type_plugin, DDPSpawnShardedPlugin) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( trainer=trainer, model=custom_plugin_model, use_cuda=use_cuda diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index f9976134df0dc..03cd3b7b65517 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -21,7 +21,7 @@ local tputests = base.BaseTest { command: utils.scriptCommand( ||| cd pytorch-lightning - coverage run --source=pytorch_lightning -m pytest -v \ + coverage run --source=pytorch_lightning -m pytest -v --capture=no \ pytorch_lightning/utilities/xla_device_utils.py \ tests/accelerators/legacy/test_tpu_backend.py \ tests/models/test_tpu.py diff --git a/docs/source/advanced/amp.rst b/docs/source/advanced/amp.rst index a0a8758fddeaf..d42f1c8c2928d 100644 --- a/docs/source/advanced/amp.rst +++ b/docs/source/advanced/amp.rst @@ -31,10 +31,10 @@ Native torch When using PyTorch 1.6+ Lightning uses the native amp implementation to support 16-bit. .. testcode:: - :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE + :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available() # turn on 16-bit - trainer = Trainer(precision=16) + trainer = Trainer(precision=16, gpus=1) Apex 16-bit ^^^^^^^^^^^ @@ -73,7 +73,7 @@ Enable 16-bit ^^^^^^^^^^^^^ .. testcode:: - :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE + :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available() # turn on 16-bit trainer = Trainer(amp_level='O2', precision=16) diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index 5e573279112a7..e759262ed8ba4 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -1178,13 +1178,13 @@ If used on TPU will use torch.bfloat16 but tensor printing will still show torch.float32. .. 
testcode:: - :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE + :skipif: not _APEX_AVAILABLE and not _NATIVE_AMP_AVAILABLE or not torch.cuda.is_available() # default used by the Trainer trainer = Trainer(precision=32) # 16-bit precision - trainer = Trainer(precision=16) + trainer = Trainer(precision=16, gpus=1) Example:: diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index a97edb21e504d..05e15fe1f1767 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -1,5 +1,4 @@ # Copyright The PyTorch Lightning team. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,15 +10,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.accelerators.legacy.accelerator import Accelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.cpu_accelerator import CPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.ddp2_accelerator import DDP2Accelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.ddp_accelerator import DDPAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.ddp_cpu_hpc_accelerator import DDPCPUHPCAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.ddp_cpu_spawn_accelerator import DDPCPUSpawnAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.ddp_hpc_accelerator import DDPHPCAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.ddp_spawn_accelerator import DDPSpawnAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.dp_accelerator import DataParallelAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.horovod_accelerator import HorovodAccelerator # noqa: F401 -from pytorch_lightning.accelerators.legacy.tpu_accelerator import TPUAccelerator # noqa: F401 +from pytorch_lightning.accelerators.accelerator import Accelerator # noqa F401 +from pytorch_lightning.accelerators.cpu import CPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.gpu import GPUAccelerator # noqa F401 +from pytorch_lightning.accelerators.tpu import TPUAccelerator # noqa F401 diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index e26dc8b476ab2..e348a57b5c103 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
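# Quick reference (illustrative, not part of the patch): after the
# ``accelerators/__init__.py`` rewrite above, the public accelerator classes
# import from the top-level subpackage rather than ``accelerators.legacy``:
from pytorch_lightning.accelerators import Accelerator, CPUAccelerator, GPUAccelerator, TPUAccelerator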
-from typing import Any, Callable, Iterable, Optional, Union +from typing import Any, Callable, Iterable, Optional, TYPE_CHECKING, Union import torch from torch.optim import Optimizer +from torch.utils.data import DataLoader from pytorch_lightning.core import LightningModule from pytorch_lightning.plugins.precision import ( @@ -26,6 +27,7 @@ from pytorch_lightning.plugins.training_type import TrainingTypePlugin from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.utilities.apply_func import move_data_to_device +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available from pytorch_lightning.utilities.enums import AMPType, LightningEnum @@ -71,7 +73,7 @@ def setup(self, trainer: "Trainer", model: LightningModule) -> None: model: the model to train """ self.connect_training_type_plugin(self.training_type_plugin, model) - self.setup_optimizers(trainer, model) + self.setup_optimizers(trainer) self.connect_precision_plugin(self.precision_plugin) @property @@ -142,6 +144,9 @@ def training_step(self, args): with self.training_type_plugin.train_step_context(): return self.training_type_plugin.training_step(*args) + def post_training_step(self): + self.training_type_plugin.post_training_step() + def validation_step(self, args): """The actual validation step. @@ -186,7 +191,7 @@ def training_step_end(self, output): Args: output: the output of the training step """ - return output + return self.training_type_plugin.training_step_end(output) def test_step_end(self, output): """A hook to do something at the end of the test step @@ -194,7 +199,7 @@ def test_step_end(self, output): Args: output: the output of the test step """ - return output + return self.training_type_plugin.test_step_end(output) def validation_step_end(self, output): """A hook to do something at the end of the validation step @@ -202,11 +207,26 @@ def validation_step_end(self, output): Args: output: the output of the validation step """ - return output + return self.training_type_plugin.validation_step_end(output) + + def predict(self, args): + """The prediction step. + + Args: + args: the arguments for the models predict step. Can consist of the following: + batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]): + The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list. + batch_idx (int): Integer displaying index of this batch + optimizer_idx (int): When using multiple optimizers, this argument will also be present. + hiddens(:class:`~torch.Tensor`): Passed in if + :paramref:`~pytorch_lightning.trainer.trainer.Trainer.truncated_bptt_steps` > 0. 
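# Sketch (hypothetical, not part of the patch): routing ``*_step_end`` through
# the training-type plugin, as done above, gives each plugin one place to
# reduce per-device outputs. An override could look roughly like:
#
#     class MyParallelPlugin(ParallelPlugin):
#         def training_step_end(self, output):
#             # hypothetical reduction: average per-replica loss tensors
#             return output.mean() if isinstance(output, torch.Tensor) else output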
- def process_dataloader( - self, dataloader: Union[Iterable, torch.utils.data.DataLoader] - ) -> Union[Iterable, torch.utils.data.DataLoader]: + """ + batch = self.to_device(args[0]) + args[0] = batch + return self.training_type_plugin.predict(*args) + + def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: """Wraps the dataloader if necessary Args: @@ -217,7 +237,7 @@ def process_dataloader( def backward( self, closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, + optimizer: Optimizer, opt_idx: int, should_accumulate: bool, *args, @@ -231,67 +251,42 @@ def backward( opt_idx: the index of the optimizer should_accumulate: whether to accumulate gradients """ + self.training_type_plugin.pre_backward(closure_loss, should_accumulate, optimizer, opt_idx) + output = self.precision_plugin.backward( self.lightning_module, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs ) - # TODO: this is a hack, find a better solution for this (hook?) - if isinstance(self.training_type_plugin, HorovodPlugin): - optimizer.synchronize() + self.training_type_plugin.post_backward(closure_loss, should_accumulate, optimizer, opt_idx) return output - def optimizer_step( - self, - optimizer: torch.optim.Optimizer, - current_epoch: int, - batch_idx: int, - opt_idx: int, - lambda_closure: Callable, - ): + def optimizer_step(self, optimizer: Optimizer, opt_idx: int, lambda_closure: Callable, **kwargs): """performs the actual optimizer step. Args: optimizer: the optimizer performing the step - current_epoch: current training epoch - batch_idx: index of the current batch opt_idx: index of the current optimizer lambda_closure: closure calculating the loss value """ - model_ref = self.lightning_module - is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = ( - isinstance(self.precision_plugin, MixedPrecisionPlugin) and self.precision_plugin.backend == AMPType.NATIVE - ) - - self.precision_plugin.pre_optimizer_step(optimizer, opt_idx) - self.training_type_plugin.pre_optimizer_step(optimizer, opt_idx) - - # model hook - res = model_ref.optimizer_step( - epoch=current_epoch, - batch_idx=batch_idx, - optimizer=optimizer, - optimizer_idx=opt_idx, - optimizer_closure=lambda_closure, - on_tpu=False, # TPUAccelerator class sets this as True - using_native_amp=native_amp, - using_lbfgs=is_lbfgs, + make_optimizer_step = self.precision_plugin.pre_optimizer_step( + self.lightning_module, optimizer, opt_idx, lambda_closure, **kwargs ) - + if make_optimizer_step: + self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs) self.precision_plugin.post_optimizer_step(optimizer, opt_idx) - self.training_type_plugin.post_optimizer_step(optimizer, opt_idx) - return res + self.training_type_plugin.post_optimizer_step(optimizer, opt_idx, **kwargs) - def optimizer_zero_grad( - self, current_epoch: int, batch_idx: int, optimizer: torch.optim.Optimizer, opt_idx: int - ) -> None: + def run_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs): + optimizer.step(closure=lambda_closure, **kwargs) + + def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None: """Zeros all model parameter's gradients""" model_ref = self.lightning_module model_ref.optimizer_zero_grad(current_epoch, batch_idx, optimizer, opt_idx) - def clip_gradients(self, optimizer: torch.optim.Optimizer, clip_val: Union[int, float]) -> None: + def clip_gradients(self, optimizer: Optimizer, 
clip_val: Union[int, float]) -> None: """clips all the optimizer parameters to the given value""" self.precision_plugin.clip_gradients(optimizer, clip_val) @@ -308,7 +303,7 @@ def on_train_end(self) -> None: """Hook to do something at the end of the training""" pass - def setup_optimizers(self, trainer: "Trainer", model: LightningModule): + def setup_optimizers(self, trainer: "Trainer"): """creates optimizers and schedulers Args: @@ -317,7 +312,7 @@ def setup_optimizers(self, trainer: "Trainer", model: LightningModule): """ if trainer.testing is True: return - optimizers, lr_schedulers, optimizer_frequencies = trainer.init_optimizers(model) + optimizers, lr_schedulers, optimizer_frequencies = trainer.init_optimizers(self.lightning_module) self.optimizers = optimizers self.lr_schedulers = lr_schedulers self.optimizer_frequencies = optimizer_frequencies @@ -374,3 +369,18 @@ def optimizer_state(self, optimizer: Optimizer) -> dict: def on_save(self, checkpoint): return checkpoint + + def barrier(self, name: Optional[str] = None) -> None: + self.training_type_plugin.barrier(name=name) + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + Return: + A tensor of shape (world_size, batch, ...) + """ + return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index b6c60bb1a7eee..cfa9545ad6aee 100644 --- a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -13,6 +13,7 @@ # limitations under the License. 
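For readers tracing the new `all_gather` hook on the Accelerator: it follows a simple shape contract, where a tensor of shape (batch, ...) contributed by each process comes back stacked as (world_size, batch, ...). Below is a minimal single-process sketch of that contract; `_stack_gather` is an illustrative stand-in for `all_gather_ddp_if_available`, not part of this patch.

import torch

def _stack_gather(tensor: torch.Tensor, world_size: int = 1) -> torch.Tensor:
    # illustrative stand-in for all_gather_ddp_if_available: each rank
    # contributes one (batch, ...) tensor and every rank receives them
    # stacked along a new leading world_size dimension
    return torch.stack([tensor] * world_size, dim=0)

gathered = _stack_gather(torch.randn(4, 3), world_size=2)
assert gathered.shape == (2, 4, 3)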
import os +from typing import List, Optional, Sequence, Union import torch @@ -26,7 +27,9 @@ DataParallelPlugin, DDP2Plugin, DDPPlugin, + DDPShardedPlugin, DDPSpawnPlugin, + DDPSpawnShardedPlugin, HorovodPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin, @@ -35,8 +38,9 @@ SingleTPUPlugin, TPUHalfPrecisionPlugin, TPUSpawnPlugin, + TrainingTypePlugin, ) -from pytorch_lightning.plugins.environments import SLURMEnvironment, TorchElasticEnvironment +from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus from pytorch_lightning.utilities import ( _APEX_AVAILABLE, @@ -73,7 +77,7 @@ def __init__( precision, amp_type, amp_level, - cluster_environment, + plugins, ): # initialization self._device_type = DeviceType.CPU @@ -92,9 +96,12 @@ def __init__( self.precision = precision self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None self.amp_level = amp_level - self.cluster_environment = cluster_environment self.is_slurm_managing_tasks = False + self._precision_plugin: Optional[PrecisionPlugin] = None + self._training_type_plugin: Optional[TrainingTypePlugin] = None + self._cluster_environment: Optional[ClusterEnvironment] = None + # init the default rank if exists # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks # this way we only show it on rank 0 @@ -106,26 +113,23 @@ def __init__( self.gpus = pick_multiple_gpus(gpus) self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus) - self.root_gpu = device_parser.determine_root_gpu_device(self.parallel_device_ids) self.set_distributed_mode() self.configure_slurm_ddp() + self.handle_given_plugins(plugins) + self.accelerator = self.select_accelerator() # override dist backend when using tpus if self.on_tpu: self.distributed_backend = "tpu" - self.use_tpu = True # init flags for SLURM+DDP to work self.world_size = 1 self.interactive_ddp_procs = [] self.global_rank = 0 - # NVIDIA setup - # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids) - # benchmarking # TODO: should this be moved to GPU accelerator? torch.backends.cudnn.benchmark = self.benchmark @@ -138,47 +142,125 @@ def __init__( # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0) - # TODO: move this to TPU accelerator/plugin - self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE") - self.replace_sampler_ddp = replace_sampler_ddp + def handle_given_plugins(self, plugins: Optional[Sequence]): + plugins = plugins if plugins is not None else [] + + if isinstance(plugins, str): + plugins = [plugins] + + if not isinstance(plugins, Sequence): + plugins = [plugins] + + training_type = None + precision = None + cluster_environment = None + + for plug in plugins: + if isinstance(plug, str): + self.set_distributed_mode(plug) + + elif isinstance(plug, TrainingTypePlugin): + if training_type is None: + training_type = plug + + else: + raise MisconfigurationException( + 'You can only specify one precision and one training type plugin.' + f' Found more than 1 training type plugin: {type(plug).__name__}' + ) + elif isinstance(plug, PrecisionPlugin): + if precision is None: + precision = plug + else: + raise MisconfigurationException( + 'You can only specify one precision and one training type plugin.' 
+ f' Found more than 1 precision plugin: {type(plug).__name__}' + ) + + elif isinstance(plug, ClusterEnvironment): + if cluster_environment is None: + cluster_environment = plug + else: + raise MisconfigurationException( + 'You can only specify one cluster environment. Found more than 1 cluster environment plugin' + ) + else: + raise MisconfigurationException( + f'Found invalid type for plugin {plug}. Expected a precision or training type plugin.' + ) + + self._training_type_plugin = training_type + self._training_type_plugin = self.training_type_plugin + self._precision_plugin = precision + self._cluster_environment = cluster_environment or self.select_cluster_environment() + + @property + def precision_plugin(self) -> PrecisionPlugin: + if self._precision_plugin is None: + self._precision_plugin = self.select_precision_plugin() + return self._precision_plugin + + @property + def training_type_plugin(self) -> TrainingTypePlugin: + if self._training_type_plugin is None: + self._training_type_plugin = self.select_training_type_plugin() + else: + self._training_type_plugin = self.resolve_training_type_plugin(self._training_type_plugin) + + return self._training_type_plugin + + @property + def cluster_environment(self) -> ClusterEnvironment: + return self._cluster_environment + @property - def on_cpu(self): + def on_cpu(self) -> bool: return self._device_type == DeviceType.CPU @property - def on_tpu(self): + def on_tpu(self) -> bool: return self.tpu_cores is not None @property - def tpu_id(self): - if self.on_tpu: + def tpu_id(self) -> Optional[int]: + if self.on_tpu and isinstance(self.tpu_cores, list): return self.tpu_cores[0] return None @property - def on_gpu(self): + def on_gpu(self) -> bool: gpus = self.parallel_device_ids return gpus is not None and len(gpus) > 0 and torch.cuda.is_available() @property - def use_dp(self): + def use_dp(self) -> bool: return self._distrib_type == DistributedType.DP @property - def use_ddp(self): - return self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) + def use_ddp(self) -> bool: + return self._distrib_type in ( + DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP_SHARDED, + DistributedType.DDP_SHARDED_SPAWN + ) @property - def use_ddp2(self): + def use_ddp2(self) -> bool: return self._distrib_type == DistributedType.DDP2 @property - def use_horovod(self): + def use_horovod(self) -> bool: return self._distrib_type == DistributedType.HOROVOD + @property + def is_distributed(self) -> bool: + is_distributed = self.use_ddp or self.use_ddp2 or self.use_horovod + if self.on_tpu: + is_distributed |= self.training_type_plugin.is_distributed + return is_distributed + @property def num_gpus(self) -> int: gpus = self.parallel_device_ids @@ -187,7 +269,7 @@ def num_gpus(self) -> int: return len(gpus) @property - def parallel_devices(self): + def parallel_devices(self) -> Union[List[torch.device], int]: if self.on_gpu: devices = [torch.device("cuda", i) for i in self.parallel_device_ids] elif self.on_tpu: @@ -199,11 +281,15 @@ def parallel_devices(self): return devices @property - def is_using_torchelastic(self): + def root_gpu(self) -> Optional[int]: + return self.accelerator.root_device.index if not isinstance(self.accelerator, TPUAccelerator) else None + + @property + def is_using_torchelastic(self) -> bool: te_flags_passed = "WORLD_SIZE" in os.environ and ("GROUP_RANK" in os.environ or "NODE_RANK" in os.environ) return te_flags_passed - def select_precision_plugin(self): + def select_precision_plugin(self) -> 
PrecisionPlugin: if self.precision == 32: self.amp_type = None return PrecisionPlugin() @@ -219,10 +305,18 @@ def select_precision_plugin(self): " Consider upgrading with `pip install torch>=1.6`." " We will attempt to use NVIDIA Apex for this session." ) + if not _APEX_AVAILABLE and self.on_cpu: + raise MisconfigurationException( + "You have asked for native AMP on CPU, but AMP is only available on GPU." + ) self.amp_type = "apex" + elif self.on_cpu: + raise MisconfigurationException( + "You have asked for native AMP on CPU, but AMP is only available on GPU." + ) else: log.info("Using native 16bit precision.") - if self.distributed_backend == "ddp_sharded" or self.distributed_backend == "ddp_sharded_spawn": + if isinstance(self.training_type_plugin, (DDPShardedPlugin, DDPSpawnShardedPlugin)): return ShardedNativeMixedPrecisionPlugin() self.amp_type = AMPType.NATIVE return NativeMixedPrecisionPlugin() @@ -234,7 +328,7 @@ def select_precision_plugin(self): " Install apex first using this guide: https://github.com/NVIDIA/apex#linux" ) else: - if self.distributed_backend == "ddp_sharded" or self.distributed_backend == "ddp_sharded_spawn": + if isinstance(self.training_type_plugin, (DDPShardedPlugin, DDPSpawnShardedPlugin)): raise MisconfigurationException( "Sharded Plugin is not supported with Apex AMP, " "please using native AMP for 16-bit precision." @@ -245,10 +339,9 @@ def select_precision_plugin(self): else: raise NotImplementedError("We only support precisions 32 and 16!") - def select_training_type_plugin(self): - cluster_environment = self.select_cluster_environment() + def select_training_type_plugin(self) -> TrainingTypePlugin: if self.use_ddp2: - plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=cluster_environment) + plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment) elif self.use_ddp: use_slurm_ddp = self.use_ddp and self.is_slurm_managing_tasks use_torchelastic_ddp = self.use_ddp and self.is_using_torchelastic @@ -256,23 +349,21 @@ def select_training_type_plugin(self): use_ddp_cpu_spawn = self.use_ddp and self.on_cpu use_ddp_cpu_torch_elastic = use_ddp_cpu_spawn and self.is_using_torchelastic use_ddp_cpu_slurm = use_ddp_cpu_spawn and self.is_slurm_managing_tasks - # use_ddp_sharded = self.distributed_backend == "ddp_sharded" - # use_ddp_sharded_spawn = self.distributed_backend == "ddp_sharded_spawn" + use_ddp_sharded = self._distrib_type == DistributedType.DDP_SHARDED + use_ddp_sharded_spawn = self._distrib_type == DistributedType.DDP_SHARDED_SPAWN - if self.on_tpu: - ddp_plugin_cls = TPUSpawnPlugin - - # ddp script mode uses the same flags as TE # TODO: decouple from TE + # ddp script mode uses the same flags as TE if os.environ.get("PL_IN_DDP_SUBPROCESS", False): use_torchelastic_ddp = False - # fixme - # if use_ddp_sharded: - # ddp_plugin_cls = DDPShardedPlugin - # elif use_ddp_sharded_spawn: - # ddp_plugin_cls = DDPSpawnShardedPlugin - if use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: + if self.on_tpu: + ddp_plugin_cls = TPUSpawnPlugin + elif use_ddp_sharded: + ddp_plugin_cls = DDPShardedPlugin + elif use_ddp_sharded_spawn: + ddp_plugin_cls = DDPSpawnShardedPlugin + elif use_ddp_cpu_slurm or use_slurm_ddp or use_ddp_cpu_torch_elastic or use_torchelastic_ddp: ddp_plugin_cls = DDPPlugin elif use_ddp_spawn or use_ddp_cpu_spawn: ddp_plugin_cls = DDPSpawnPlugin @@ -282,7 +373,7 @@ def select_training_type_plugin(self): plugin = ddp_plugin_cls( 
parallel_devices=self.parallel_devices, num_nodes=self.num_nodes, - cluster_environment=cluster_environment, + cluster_environment=self.cluster_environment, sync_batchnorm=self.sync_batchnorm, ) elif self.use_dp: @@ -290,14 +381,39 @@ def select_training_type_plugin(self): elif self.use_horovod: plugin = HorovodPlugin(parallel_devices=self.parallel_devices) elif self.on_tpu: - plugin = SingleTPUPlugin(self.tpu_id) + if isinstance(self.tpu_cores, list): + plugin = SingleTPUPlugin(self.tpu_id) + else: + plugin = TPUSpawnPlugin(parallel_devices=list(range(self.tpu_cores))) else: - plugin = SingleDevicePlugin(device=torch.device(f"cuda:{self.root_gpu}" if self.on_gpu else "cpu")) + single_gpu_ordinal = device_parser.determine_root_gpu_device(self.parallel_device_ids) + plugin = SingleDevicePlugin(device=torch.device(f"cuda:{single_gpu_ordinal}" if self.on_gpu else "cpu")) return plugin - def select_accelerator(self): + def resolve_training_type_plugin(self, training_type: TrainingTypePlugin) -> TrainingTypePlugin: + # necessary for RPC, when user has to provide balance + if hasattr(training_type, 'parallel_devices') and not getattr(training_type, 'parallel_devices'): + training_type.parallel_devices = self.parallel_devices + if hasattr(training_type, 'num_processes'): + training_type.num_processes = len(self.parallel_devices) + + if hasattr(training_type, 'cluster_environment') and getattr(training_type, 'cluster_environment') is None: + training_type.cluster_environment = self.select_cluster_environment() + + if hasattr(training_type, 'num_nodes') and getattr(training_type, 'num_nodes') is None: + training_type.num_nodes = self.num_nodes + + return training_type + + def select_accelerator(self) -> Accelerator: if isinstance(self.distributed_backend, Accelerator): # custom accelerator from user + if self._precision_plugin is not None or self._training_type_plugin is not None: + # plugins also specified by user + rank_zero_warn( + 'Specified `Precision` and `TrainingType` plugins will be ignored,' + ' since an `Accelerator` instance was provided.' + ) return self.distributed_backend if self.on_gpu: @@ -308,26 +424,35 @@ def select_accelerator(self): acc_cls = CPUAccelerator return acc_cls( - precision_plugin=self.select_precision_plugin(), - training_type_plugin=self.select_training_type_plugin(), + precision_plugin=self.precision_plugin, + training_type_plugin=self.training_type_plugin, ) - def select_cluster_environment(self): - if self.cluster_environment is not None: - return self.cluster_environment + def select_cluster_environment(self) -> ClusterEnvironment: + if self._cluster_environment is not None: + return self._cluster_environment if self.is_slurm_managing_tasks: env = SLURMEnvironment() + # TODO: decouple DDP from SLURM + # refactor and let generic cluster env hold the information about who spawns the processes + os.environ["PL_IN_DDP_SUBPROCESS"] = "1" elif self.is_using_torchelastic: env = TorchElasticEnvironment() # TODO: decouple DDP from TE - # maybe introduce a DefaultEnvironment? + # refactor and let generic cluster env hold the information about who spawns the processes os.environ["PL_IN_DDP_SUBPROCESS"] = "1" else: # TODO: maybe introduce a DefaultEnvironment? 
            env = TorchElasticEnvironment()
         return env

-    def set_distributed_mode(self):
+    def set_distributed_mode(self, distributed_backend: Optional[str] = None):
+
+        if distributed_backend is not None:
+            self.distributed_backend = distributed_backend
+
+        if isinstance(self.distributed_backend, Accelerator):
+            return

         if self.distributed_backend is None:
             if self.has_horovodrun():
@@ -344,34 +469,33 @@ def set_distributed_mode(self):
         # special case with DDP on CPUs
         if self.distributed_backend == "ddp_cpu":
             self._distrib_type = DistributedType.DDP
-            self.data_parallel_device_ids = None
             if self.num_gpus > 0:
                 rank_zero_warn(
                     'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
                 )
+            self.parallel_device_ids = None
             if self.num_processes is None:
                 # define the max CPU available
                 self.num_processes = os.cpu_count()
         # special case with TPUs
         elif self.distributed_backend == 'tpu':
             self._device_type = DeviceType.TPU
-        # set all other requested distrib. types adn if it was not set in the
         elif self.distributed_backend and self._distrib_type is None:
             self._distrib_type = DistributedType(self.distributed_backend)

         # unless you request explicitly for CPU and some GPU are available use them
         _on_cpu = self.distributed_backend and 'cpu' in self.distributed_backend
-        if (self.num_gpus > 0 and not _on_cpu):
+        if self.num_gpus > 0 and not _on_cpu:
             self._device_type = DeviceType.GPU

         _distrib_types = (DistributedType.DP, DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
         # DP and DDP2 cannot run without GPU
-        if (self.num_gpus == 0 and self._distrib_type in _distrib_types):
+        if self.num_gpus == 0 and self._distrib_type in _distrib_types and not _on_cpu:
             rank_zero_warn(
                 'You requested distributed training on GPUs, but none is available, so we set backend to `ddp_cpu`.'
             )
             # todo: in some cases this yields a comparison between None and int
-            if ((self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1)):
+            if (self.num_nodes and self.num_nodes > 1) or (self.num_processes and self.num_processes > 1):
                 self._distrib_type = DistributedType.DDP
             else:
                 rank_zero_warn('You are running on single node with no parallelization, so distributed has no effect.')
@@ -384,6 +508,9 @@ def set_distributed_mode(self):
         ):
             self.num_processes = self.num_gpus

+        if (self._device_type == DeviceType.GPU and self._distrib_type == DistributedType.DDP2):
+            self.num_processes = self.num_nodes
+
         # Horovod is an extra case...
if self.distributed_backend == "horovod": self._set_horovod_backend() @@ -412,7 +539,6 @@ def _set_horovod_backend(self): if self.on_gpu: # Horovod assigns one local GPU per process self.parallel_device_ids = list(range(hvd.local_size())) - self.root_gpu = hvd.local_rank() else: self.num_processes = hvd.local_size() @@ -431,7 +557,7 @@ def check_horovod(self): ) @staticmethod - def has_horovodrun(): + def has_horovodrun() -> bool: """Returns True if running with `horovodrun` using Gloo or OpenMPI.""" return "OMPI_COMM_WORLD_RANK" in os.environ or "HOROVOD_RANK" in os.environ diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 833d5e1cb2a9a..9ec6ad5cdee75 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -1,17 +1,21 @@ +import logging +import os + import torch from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.utilities.exceptions import MisconfigurationException +_log = logging.getLogger(__name__) + class GPUAccelerator(Accelerator): def setup(self, trainer, model): if "cuda" not in str(self.root_device): raise MisconfigurationException(f"Device should be GPU, got {self.root_device} instead") + self.set_nvidia_flags() torch.cuda.set_device(self.root_device) - model.to(self.root_device) - return super().setup(trainer, model) def on_train_start(self): @@ -25,3 +29,11 @@ def on_train_end(self): # clean up memory with torch.cuda.device(self.root_device): torch.cuda.empty_cache() + + @staticmethod + def set_nvidia_flags(): + # set the correct cuda visible devices (using pci order) + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) + devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids) + _log.info(f"LOCAL_RANK: {os.getenv('LOCAL_RANK', 0)} - CUDA_VISIBLE_DEVICES: [{devices}]") diff --git a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py index 009144bb8431a..71a9edecf4c34 100644 --- a/pytorch_lightning/accelerators/legacy/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/legacy/tpu_accelerator.py @@ -13,7 +13,6 @@ # limitations under the License. 
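The `set_nvidia_flags` helper added to the GPU accelerator above only pins two environment variables. A rough standalone equivalent is sketched below, assuming it runs before any CUDA context is created; otherwise `CUDA_VISIBLE_DEVICES` has no effect on the current process.

import os
import torch

# order devices by PCI bus id so indices are stable across tools
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# default CUDA_VISIBLE_DEVICES to every detected GPU unless the user set it
all_gpu_ids = ",".join(str(i) for i in range(torch.cuda.device_count()))
devices = os.environ.setdefault("CUDA_VISIBLE_DEVICES", all_gpu_ids)
print(f"LOCAL_RANK: {os.getenv('LOCAL_RANK', 0)} - CUDA_VISIBLE_DEVICES: [{devices}]")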
import io import os -import re from typing import Any, Callable, Optional, Union import torch @@ -31,7 +30,6 @@ rank_zero_only, rank_zero_warn, ) -from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException if _TPU_AVAILABLE: @@ -307,29 +305,6 @@ def load_spawn_weights(self, original_model): return loaded_model - def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): - if self.trainer.distributed_backend not in ("ddp_spawn", "ddp_cpu", "tpu"): - return - - # track the best model path - best_model_path = None - if self.trainer.checkpoint_callback is not None: - best_model_path = self.trainer.checkpoint_callback.best_model_path - - if self.trainer.global_rank == 0 and mp_queue is not None: - rank_zero_warn('cleaning up ddp environment...') - # todo, pass complete checkpoint as state dictionary - mp_queue.put(best_model_path) - mp_queue.put(results) - - # save the last weights - last_path = None - if not self.trainer.testing and best_model_path is not None and len(best_model_path) > 0: - last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) - state_dict = move_data_to_device(model.state_dict(), torch.device("cpu")) - atomic_save(state_dict, last_path) - mp_queue.put(last_path) - def broadcast(self, obj, src=0): if self.trainer.tpu_id is not None: # running on a single core diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 66ed4e5126400..8f63bc7b86b11 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -1,9 +1,18 @@ +from typing import Any, Callable, Optional, Union + +import torch +from torch.optim import Optimizer + from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin +from pytorch_lightning.utilities import _XLA_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException +if _XLA_AVAILABLE: + import torch_xla.core.xla_model as xm + class TPUAccelerator(Accelerator): @@ -17,3 +26,18 @@ def setup(self, trainer, model): if not isinstance(self.training_type_plugin, (SingleTPUPlugin, TPUSpawnPlugin)): raise MisconfigurationException("TPUs only support a single tpu core or tpu spawn training.") return super().setup(trainer, model) + + def run_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs): + xm.optimizer_step(optimizer, optimizer_args={'closure': lambda_closure, **kwargs}) + + def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False): + """ + Function to gather a tensor from several distributed processes + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + Return: + A tensor of shape (world_size, batch, ...) 
+ """ + return xm.all_gather(tensor, group=group, sync_grads=sync_grads) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index d0d7ec3d6e606..7f42af82c48d5 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -175,6 +175,7 @@ def _run_early_stopping_check(self, trainer, pl_module): if self.monitor_op(current - self.min_delta, self.best_score): self.best_score = current self.wait_count = 0 + should_stop = False else: self.wait_count += 1 should_stop = self.wait_count >= self.patience @@ -184,5 +185,5 @@ def _run_early_stopping_check(self, trainer, pl_module): trainer.should_stop = True # stop every ddp process if any world process decides to stop - should_stop = trainer.accelerator_backend.early_stopping_should_stop(pl_module) + should_stop = trainer.training_type_plugin.reduce_early_stopping_decision(should_stop) trainer.should_stop = should_stop diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index f55a636deaf3b..e6de1737b3f41 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -439,7 +439,7 @@ def __resolve_ckpt_dir(self, trainer): if isinstance(trainer.logger.version, str) else f"version_{trainer.logger.version}" ) - version, name = trainer.accelerator_backend.broadcast((version, trainer.logger.name)) + version, name = trainer.training_type_plugin.broadcast((version, trainer.logger.name)) ckpt_path = os.path.join(save_dir, str(name), version, "checkpoints") else: @@ -520,11 +520,9 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics): trainer, ) - accelerator_backend = trainer.accelerator_backend - - if accelerator_backend is not None and accelerator_backend.rpc_enabled: + if trainer.training_type_plugin.rpc_enabled: # RPCPlugin manages saving all model states - accelerator_backend.ddp_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module) + trainer.training_type_plugin.rpc_save_model(self._save_model, last_filepath, trainer, pl_module) else: self._save_model(last_filepath, trainer, pl_module) if ( @@ -607,6 +605,5 @@ def file_exists(self, filepath: Union[str, Path], trainer) -> bool: the internal state to diverge between ranks. """ exists = self._fs.exists(filepath) - if trainer.accelerator_backend is not None: - exists = trainer.accelerator_backend.broadcast(exists) + exists = trainer.training_type_plugin.broadcast(exists) return exists diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index e84be73e41acf..59bd10c042018 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -275,7 +275,7 @@ def log( f"Logged key: {name} should not contain information about dataloader_idx." 
            )

-        accelerator = self.trainer.accelerator_backend
+        training_type_plugin = self.trainer.training_type_plugin

         self._results.log(
             name,
@@ -291,7 +291,7 @@
             sync_dist,
             sync_dist_op,
             sync_dist_group,
-            accelerator.sync_tensor,
+            training_type_plugin.reduce,
             self._current_dataloader_idx,
             self.device,
         )
@@ -1347,7 +1347,7 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx,
         """
         if not isinstance(optimizer, LightningOptimizer):
             # wraps into LightningOptimizer only for running step
-            optimizer = LightningOptimizer.to_lightning_optimizer(optimizer, self.trainer)
+            optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, optimizer_idx)
         optimizer.step(closure=optimizer_closure)

     def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
diff --git a/pytorch_lightning/core/memory.py b/pytorch_lightning/core/memory.py
index e05feff0db5bf..ce90e21e3528c 100644
--- a/pytorch_lightning/core/memory.py
+++ b/pytorch_lightning/core/memory.py
@@ -183,7 +183,9 @@ def __init__(self, model, mode: str = MODE_DEFAULT):
         self._mode = mode
         self._layer_summary = self.summarize()
         # 1 byte -> 8 bits
-        self._precision_megabytes = (self._model.precision / 8.0) * 1e-6
+        # TODO: how do we compute precision_megabytes in case of mixed precision?
+        precision = self._model.precision if isinstance(self._model.precision, int) else 32
+        self._precision_megabytes = (precision / 8.0) * 1e-6

     @property
     def named_modules(self) -> List[Tuple[str, nn.Module]]:
diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py
index 77812cf3ef12b..42af0f44e0071 100644
--- a/pytorch_lightning/core/optimizer.py
+++ b/pytorch_lightning/core/optimizer.py
@@ -17,12 +17,9 @@

 from torch.optim.optimizer import Optimizer

-from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType
+from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException

-if _TPU_AVAILABLE:
-    import torch_xla.core.xla_model as xm
-

 def is_lightning_optimizer(optimizer):
     return isinstance(optimizer, LightningOptimizer)
@@ -62,6 +59,7 @@ def __init__(self, optimizer: Optimizer, accumulate_grad_batches: Optional[int]
         self._trainer = None
         self._accumulate_grad_batches = accumulate_grad_batches
         self._optimizer_idx = None
+        self._total_optimizer_step_calls = 0

     @property
     def optimizer(self):
@@ -128,29 +126,13 @@ def _should_accumulate(self):
         is_final_batch = self._trainer.train_loop._num_training_batches_reached()
         return not (accumulation_done or is_final_batch)

-    def __optimizer_step(self, *args, closure: Optional[Callable] = None, profiler_name: str = None, **kwargs):
+    def __optimizer_step(self, closure: Optional[Callable] = None, profiler_name: str = None, **kwargs):
         trainer = self._trainer
         optimizer = self._optimizer
         model = trainer.get_model()

-        if trainer._device_type == DeviceType.TPU:
-            with trainer.profiler.profile(profiler_name):
-                xm.optimizer_step(optimizer, optimizer_args={'closure': closure, **kwargs})
-
-        elif trainer.amp_backend is not None:
-            trainer.precision_connector.backend.optimizer_step(trainer, optimizer, closure)
-
-        else:
-            with trainer.profiler.profile(profiler_name):
-                optimizer.step(closure=closure, *args, **kwargs)
-
-        accelerator_backend = trainer.accelerator_backend
-        if accelerator_backend is not None and accelerator_backend.rpc_enabled:
-            if accelerator_backend.ddp_plugin.is_main_rpc_process:
-                # Initialize optimizer step on main process
accelerator_backend.ddp_plugin.worker_optimizer_step( - model=model, opt_idx=self._optimizer_idx, *args, **kwargs - ) + with trainer.profiler.profile(profiler_name): + trainer.accelerator_backend.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs) trainer.train_loop.on_before_zero_grad(optimizer) @@ -277,10 +259,11 @@ def dis_closure(): if make_optimizer_step: self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs) + self._total_optimizer_step_calls += 1 else: # make sure to call optimizer_closure when accumulating with self._trainer.profiler.profile(f"closure_{self._optimizer_idx}"): - with self._trainer.train_loop.block_ddp_sync_behaviour(): + with self._trainer.train_loop.block_ddp_sync_behaviour(True): closure() def __repr__(self): diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index c227c039d2bca..974974b032bec 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -148,6 +148,9 @@ def log( value = torch.tensor(value, device=device, dtype=torch.float) value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) + if isinstance(value, torch.Tensor) and value.device.type == "xla": + value = value.cpu() + if 'meta' not in self: self.__setitem__('meta', {}) diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 63708ff1e5852..b023b363a0b08 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -24,7 +24,7 @@ from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment from pytorch_lightning.utilities import _module_available, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.warning_utils import WarningCache +from pytorch_lightning.utilities.warnings import WarningCache _WANDB_AVAILABLE = _module_available("wandb") diff --git a/pytorch_lightning/overrides/base.py b/pytorch_lightning/overrides/base.py index 3dd20f6d4303b..1a33556991148 100644 --- a/pytorch_lightning/overrides/base.py +++ b/pytorch_lightning/overrides/base.py @@ -46,6 +46,13 @@ def forward(self, *inputs, **kwargs): if running_stage == RunningStage.TRAINING: output = self.module.training_step(*inputs, **kwargs) + + # In manual_optimization, we need to prevent DDP reducer as + # it is done manually in ``LightningModule.manual_backward`` + # `require_backward_grad_sync` will be reset in the + # ddp_plugin ``post_training_step`` hook + if not self.module.automatic_optimization: + self.module.trainer.model.require_backward_grad_sync = False warn_if_output_is_none(output, "training_step") elif running_stage == RunningStage.TESTING: output = self.module.test_step(*inputs, **kwargs) @@ -55,7 +62,6 @@ def forward(self, *inputs, **kwargs): warn_if_output_is_none(output, "validation_step") else: output = self.module.predict(*inputs, **kwargs) - return output diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index f413065f627ff..f7c3b8d5fd575 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -11,31 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
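To make the manual-optimization handshake above concrete: the forward wrapper flips DDP's `require_backward_grad_sync` off before the user's `training_step`, and the plugin's `post_training_step` flips it back on once the manual backward has run. Below is a stripped-down sketch of that toggle with a stand-in for the DDP wrapper; all names here are illustrative, not part of the patch.

class _FakeDDP:
    # stands in for torch.nn.parallel.DistributedDataParallel
    require_backward_grad_sync = True

def training_step(ddp: _FakeDDP, automatic_optimization: bool) -> None:
    if not automatic_optimization:
        ddp.require_backward_grad_sync = False  # skip the reducer during manual backward
    # ... user computes loss and calls manual_backward() here ...

def post_training_step(ddp: _FakeDDP, automatic_optimization: bool) -> None:
    if not automatic_optimization:
        ddp.require_backward_grad_sync = True  # restore the reducer for the next step

ddp = _FakeDDP()
training_step(ddp, automatic_optimization=False)
assert ddp.require_backward_grad_sync is False
post_training_step(ddp, automatic_optimization=False)
assert ddp.require_backward_grad_sync is True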
-from pytorch_lightning.trainer.states import RunningStage +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE LightningShardedDataParallel = None if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - class LightningShardedDataParallel(ShardedDataParallel): + class LightningShardedDataParallel(_LightningModuleWrapperBase): + # Just do this for later docstrings + pass - def forward(self, *inputs, **kwargs): - if self.enable_broadcast_buffers: - self.sync_buffers() + def unwrap_lightning_module_sharded(wrapped_model) -> LightningModule: + model = wrapped_model + if isinstance(model, ShardedDataParallel): + model = model.module - running_stage = self.module.running_stage - - if running_stage == RunningStage.TRAINING: - outputs = self.module.training_step(*inputs, **kwargs) - - elif running_stage == RunningStage.TESTING: - outputs = self.module.test_step(*inputs, **kwargs) - - elif running_stage == RunningStage.EVALUATING: - outputs = self.module.validation_step(*inputs, **kwargs) - - else: - outputs = self.module.predict(*inputs, **kwargs) - - return outputs + return unwrap_lightning_module(model) diff --git a/pytorch_lightning/plugins/__init__.py b/pytorch_lightning/plugins/__init__.py index 0990b547907e7..2d9086c2e18ad 100644 --- a/pytorch_lightning/plugins/__init__.py +++ b/pytorch_lightning/plugins/__init__.py @@ -10,6 +10,10 @@ from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.rpc import RPCPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 +from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin # noqa: F401 from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 @@ -29,4 +33,11 @@ "SingleTPUPlugin", "TPUHalfPrecisionPlugin", "TPUSpawnPlugin", + 'RPCPlugin', + 'RPCSequentialPlugin', + 'TrainingTypePlugin', + 'ParallelPlugin', + 'Plugin', + 'DDPShardedPlugin', + 'DDPSpawnShardedPlugin', ] diff --git a/pytorch_lightning/plugins/base_plugin.py b/pytorch_lightning/plugins/base_plugin.py index b316a8663f9ff..b8bdf38a57137 100644 --- a/pytorch_lightning/plugins/base_plugin.py +++ b/pytorch_lightning/plugins/base_plugin.py @@ -13,27 +13,26 @@ # limitations under the License. 
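The unwrapping logic above is repeated peeling: keep taking `.module` until no known wrapper remains. A self-contained sketch of the idea, using a dummy wrapper class rather than the real `ShardedDataParallel`:

class _Wrapper:
    # dummy stand-in for DistributedDataParallel / ShardedDataParallel
    def __init__(self, module):
        self.module = module

def unwrap(model):
    # peel nested wrappers down to the innermost (Lightning) module
    while isinstance(model, _Wrapper):
        model = model.module
    return model

core = object()
assert unwrap(_Wrapper(_Wrapper(core))) is core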
import contextlib from abc import ABC, abstractmethod -from typing import Any, Generator, Optional, overload, Sequence, Tuple +from typing import Any, Callable, Generator, Optional, overload, Sequence, Tuple import torch +from torch.nn import Module class Plugin(ABC): """Basic Plugin class to derive precision and training type plugins from.""" @abstractmethod - def connect(self, model: torch.nn.Module, *args: Sequence, - **kwargs: Sequence) -> Optional[Tuple[torch.nn.Module, Sequence, Sequence]]: + def connect( + self, + model: Module, + *args: Sequence, + **kwargs: Sequence, + ) -> Optional[Tuple[Module, Sequence, Sequence]]: """Connects the plugin with the accelerator (and thereby with trainer and model). Will be called by the accelerator. """ - def pre_optimizer_step(self, optimizer: torch.optim.Optimizer, optimizer_idx: int) -> None: - """Hook to do something before each optimizer step.""" - - def post_optimizer_step(self, optimizer: torch.optim.Optimizer, optimizer_idx: int) -> None: - """Hook to do something after each optimizer step.""" - def pre_training(self) -> None: """Hook to do something before the training starts.""" diff --git a/pytorch_lightning/plugins/environments/cluster_environment.py b/pytorch_lightning/plugins/environments/cluster_environment.py index 2139f5bac0020..41af4fe84c7f0 100644 --- a/pytorch_lightning/plugins/environments/cluster_environment.py +++ b/pytorch_lightning/plugins/environments/cluster_environment.py @@ -26,8 +26,11 @@ def master_address(self): def master_port(self): pass - def world_size(self): + def world_size(self) -> int: return self._world_size - def local_rank(self): + def local_rank(self) -> int: + pass + + def node_rank(self) -> int: pass diff --git a/pytorch_lightning/plugins/environments/slurm_environment.py b/pytorch_lightning/plugins/environments/slurm_environment.py index 01c76ad0533e2..59ab27cd4c323 100644 --- a/pytorch_lightning/plugins/environments/slurm_environment.py +++ b/pytorch_lightning/plugins/environments/slurm_environment.py @@ -32,7 +32,7 @@ def master_address(self): else: root_node = "127.0.0.1" - root_node = self._resolve_root_node_address(root_node) + root_node = self.resolve_root_node_address(root_node) os.environ["MASTER_ADDR"] = root_node log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") return root_node @@ -70,7 +70,10 @@ def world_size(self): def local_rank(self): return int(os.environ['SLURM_LOCALID']) - def _resolve_root_node_address(self, root_node): + def node_rank(self): + return int(os.environ['SLURM_NODEID']) + + def resolve_root_node_address(self, root_node): if '[' in root_node: name, numbers = root_node.split('[', maxsplit=1) number = numbers.split(',', maxsplit=1)[0] diff --git a/pytorch_lightning/plugins/environments/torchelastic_environment.py b/pytorch_lightning/plugins/environments/torchelastic_environment.py index 5d060e62032dc..bb77760e9dd61 100644 --- a/pytorch_lightning/plugins/environments/torchelastic_environment.py +++ b/pytorch_lightning/plugins/environments/torchelastic_environment.py @@ -46,3 +46,6 @@ def world_size(self): def local_rank(self): return int(os.environ['LOCAL_RANK']) + + def node_rank(self) -> int: + return int(os.environ.get('GROUP_RANK', 0)) diff --git a/pytorch_lightning/plugins/precision/apex_amp.py b/pytorch_lightning/plugins/precision/apex_amp.py index b9720f19fe3eb..884b05cfd8de2 100644 --- a/pytorch_lightning/plugins/precision/apex_amp.py +++ b/pytorch_lightning/plugins/precision/apex_amp.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +from typing import Callable, List, Tuple import torch from torch.optim import Optimizer @@ -38,6 +38,8 @@ def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): """Connects the precision plugin to the training process, configures apex and reinits the schedulers """ + if model.device.type != "cuda": + return model, optimizers, lr_schedulers model, optimizers = self.configure_apex(amp, model, optimizers, self.amp_level) self.reinit_scheduler_properties(optimizers, lr_schedulers) return model, optimizers, lr_schedulers @@ -71,7 +73,11 @@ def backward( # do backward pass # TODO: not entirely sure, why we need this if model is not None and isinstance(model, LightningModule): - model.backward(closure_loss, optimizer, opt_idx) + model.backward(closure_loss, optimizer, opt_idx, **kwargs) + + # TODO: avoid dev_debugger and track these calls with mock + model.trainer.dev_debugger.track_event('AMP', str(AMPType.APEX)) + else: closure_loss.backward(*args, **kwargs) @@ -125,22 +131,34 @@ def reinit_scheduler_properties(optimizers: list, schedulers: list): """Reinitializes schedulers with correct properties""" # Reinitialize optimizer.step properties added by schedulers for scheduler in schedulers: - scheduler = scheduler["scheduler"] + scheduler = scheduler['scheduler'] + state = None for optimizer in optimizers: - state = None - idx = 0 - # check that we dont mix users optimizers and schedulers if scheduler.optimizer == optimizer: # Find the mro belonging to the base lr scheduler class for i, mro in enumerate(scheduler.__class__.__mro__): if mro in (torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): - idx = i state = scheduler.state_dict() - else: - state = None + scheduler.__class__.__mro__[i].__init__(scheduler, optimizer) + scheduler.load_state_dict(state) + break - scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer) if state is not None: - scheduler.load_state_dict(state) + break + + def pre_optimizer_step( + self, pl_module: LightningModule, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs + ) -> bool: + """ + always called before the optimizer step. + """ + # apex amp does not support closures. + lambda_closure() + + if not pl_module.automatic_optimization: + pl_module.trainer.call_hook("on_after_backward") + optimizer.step() + + return False diff --git a/pytorch_lightning/plugins/precision/native_amp.py b/pytorch_lightning/plugins/precision/native_amp.py index 8cdaba833af85..e8a6511798664 100644 --- a/pytorch_lightning/plugins/precision/native_amp.py +++ b/pytorch_lightning/plugins/precision/native_amp.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Generator +from typing import Callable, Generator import torch +from torch.optim import LBFGS, Optimizer from pytorch_lightning.core import LightningModule from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin @@ -33,25 +34,11 @@ def __init__(self): self.backend = AMPType.NATIVE self.scaler = torch.cuda.amp.GradScaler() - def pre_optimizer_step(self, optimizer: torch.optim.Optimizer, optimizer_idx: int) -> None: - """always called before the optimizer step. 
- Checks that the optimizer is not LBFGS, as this one is not supported by native amp - """ - if isinstance(optimizer, torch.optim.LBFGS): - raise MisconfigurationException( - f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." - " To request, please file a Github issue in PyTorch and tag @mcarilli" - ) - - def post_optimizer_step(self, optimizer: torch.optim.Optimizer, optimizer_idx: int) -> None: - """Updates the GradScaler""" - self.scaler.update() - def backward( self, model: LightningModule, closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, + optimizer: Optimizer, opt_idx: int, should_accumulate: bool, *args, @@ -69,16 +56,39 @@ def backward( """ closure_loss = self.scaler.scale(closure_loss) - automatic_optimization = model.automatic_optimization - closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs) # unscale gradient to allow analyze within `on_after_backward` - if not should_accumulate and automatic_optimization: + if not should_accumulate and model.automatic_optimization: self.scaler.unscale_(optimizer) return closure_loss + def pre_optimizer_step( + self, pl_module: LightningModule, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs + ) -> bool: + """always called before the optimizer step. + Checks that the optimizer is not LBFGS, as this one is not supported by native amp + """ + if isinstance(optimizer, LBFGS): + raise MisconfigurationException( + f"native PyTorch amp and lbfgs are not compatible (optimizer {optimizer_idx})." + " To request, please file a Github issue in PyTorch and tag @mcarilli" + ) + lambda_closure() + + if not pl_module.automatic_optimization: + self.scaler.unscale_(optimizer) + + pl_module.trainer.call_hook("on_after_backward") + + return False + + def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int) -> None: + """Updates the GradScaler""" + self.scaler.step(optimizer) + self.scaler.update() + @contextmanager def train_step_context(self) -> Generator[autocast, None, None]: """Enable autocast context""" diff --git a/pytorch_lightning/plugins/precision/precision_plugin.py b/pytorch_lightning/plugins/precision/precision_plugin.py index 3e74442e92277..2216d3ae46d53 100644 --- a/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/pytorch_lightning/plugins/precision/precision_plugin.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import math -from typing import Any, Generator, Sequence, Tuple, Union +from typing import Any, Callable, Generator, Sequence, Tuple, Union import torch +from torch.nn import Module from torch.optim import Optimizer from pytorch_lightning.core import LightningModule @@ -28,7 +29,7 @@ class PrecisionPlugin(Plugin): EPSILON = 1e-6 precision = 32 - def master_params(self, optimizer: torch.optim.Optimizer) -> Generator[torch.Tensor, None, None]: + def master_params(self, optimizer: Optimizer) -> Generator[torch.Tensor, None, None]: """The master params of the model. Returns the plain model params here. Maybe different in other precision plugins. 
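The division of labour in the native-AMP hooks above is easy to miss: `pre_optimizer_step` runs the closure and returns False so the accelerator skips the plain `optimizer.step`, while `post_optimizer_step` lets the GradScaler perform the step and rescale. A CPU-safe sketch of that scaler sequence follows; without CUDA the scaler degrades to a no-op, so the snippet runs anywhere.

import torch

model = torch.nn.Linear(2, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

loss = model(torch.randn(8, 2)).sum()
scaler.scale(loss).backward()  # backward on the scaled loss
scaler.unscale_(optimizer)     # unscale so hooks can inspect real gradients
scaler.step(optimizer)         # skips the step if any gradient overflowed
scaler.update()                # adjust the scale factor for the next step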
@@ -37,8 +38,8 @@ def master_params(self, optimizer: torch.optim.Optimizer) -> Generator[torch.Ten for p in group["params"]: yield p - def connect(self, model: torch.nn.Module, optimizers: Sequence, - lr_schedulers: Sequence) -> Tuple[torch.nn.Module, Sequence, Sequence]: + def connect(self, model: Module, optimizers: Sequence, + lr_schedulers: Sequence) -> Tuple[Module, Sequence, Sequence]: """Connects this plugin to the accelerator and the training process""" return model, optimizers, lr_schedulers @@ -46,7 +47,7 @@ def backward( self, model: LightningModule, closure_loss: torch.Tensor, - optimizer: torch.optim.Optimizer, + optimizer: Optimizer, opt_idx: int, should_accumulate: bool, *args: Any, @@ -75,6 +76,15 @@ def backward( return closure_loss + def pre_optimizer_step( + self, pl_module: LightningModule, optimizer: Optimizer, optimizer_idx: int, closure: Callable, **kwargs + ) -> bool: + """Hook to do something before each optimizer step.""" + return True + + def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int) -> None: + """Hook to do something after each optimizer step.""" + def clip_gradients(self, optimizer: Optimizer, clip_val: Union[int, float], norm_type: float = float(2.0)) -> None: """Clips the gradients to a specific value""" # TODO: separate TPU case from here diff --git a/pytorch_lightning/plugins/precision/tpu_bfloat.py b/pytorch_lightning/plugins/precision/tpu_bfloat.py index 7f4916dd26a46..c911bf69184f6 100644 --- a/pytorch_lightning/plugins/precision/tpu_bfloat.py +++ b/pytorch_lightning/plugins/precision/tpu_bfloat.py @@ -25,4 +25,4 @@ class TPUHalfPrecisionPlugin(PrecisionPlugin): def connect(self, model: torch.nn.Module, optimizers, lr_schedulers): os.environ["XLA_USE_BF16"] = str(1) - return super().connect(model=model, optimizers=optimizers, lr_schedulers=lr_schedulers) + return super().connect(model=model, optimizers=optimizers, lr_schedulers=lr_schedulers) \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/__init__.py b/pytorch_lightning/plugins/training_type/__init__.py index 21dec5bc5ccda..a5a644fc6568c 100644 --- a/pytorch_lightning/plugins/training_type/__init__.py +++ b/pytorch_lightning/plugins/training_type/__init__.py @@ -4,6 +4,8 @@ from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.plugins.training_type.rpc import RPCPlugin +from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index bb906a2268d62..52a24655f0846 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -21,13 +21,16 @@ import torch import torch.distributed as torch_distrib from torch.nn.parallel.distributed import DistributedDataParallel +from torch.optim import Optimizer from pytorch_lightning import _logger as log from pytorch_lightning.distributed import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.distributed import 
prepare_for_backward +from pytorch_lightning.plugins.environments import SLURMEnvironment, TorchElasticEnvironment from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.utilities import _HYDRA_AVAILABLE +from pytorch_lightning.utilities import _HYDRA_AVAILABLE, _TORCH_GREATER_EQUAL_1_7, rank_zero_warn from pytorch_lightning.utilities.distributed import ( find_free_network_port, rank_zero_only, @@ -70,7 +73,7 @@ def __init__( self._has_spawned_children = False self.task_idx = None self.node_rank = 0 - self.num_processes = len(parallel_devices) + self.num_processes = len(parallel_devices) if parallel_devices is not None else parallel_devices @property def root_device(self): @@ -85,7 +88,7 @@ def setup(self, model): self._model = model # start the other scripts - # TODO: make sure this works, in torchelastic we should not launch child processes! + # TODO: refactor and let generic cluster env hold the information about who spawns the processes if os.environ.get("PL_IN_DDP_SUBPROCESS", "0") != "1": self._call_children_scripts() @@ -177,7 +180,19 @@ def set_world_ranks(self): self.global_rank = self.node_rank * self.num_processes + self.local_rank self.world_size = self.num_nodes * self.num_processes + def pre_configure_ddp(self): + # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()`` breaking manual_optimization + if _TORCH_GREATER_EQUAL_1_7 and not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( + "find_unused_parameters", False + ): + rank_zero_warn( + "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " + "to properly work with DDP." 
+ ) + self._ddp_kwargs["find_unused_parameters"] = True + def configure_ddp(self): + self.pre_configure_ddp() self._model = DistributedDataParallel( LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), @@ -253,6 +268,11 @@ def barrier(self, *args, **kwargs): def broadcast(self, obj: object, src: int = 0) -> object: return self.dist.broadcast(obj) + def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): + """Run before precision plugin executes backward""" + if not self.lightning_module.automatic_optimization and self.model.require_backward_grad_sync: + prepare_for_backward(self.model, closure_loss) + def model_to_device(self): if self.root_device.type == "cuda": torch.cuda.set_device(self.root_device) @@ -271,3 +291,10 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) + + def predict(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def post_training_step(self): + if not self.lightning_module.automatic_optimization: + self.model.require_backward_grad_sync = True diff --git a/pytorch_lightning/plugins/training_type/ddp_spawn.py b/pytorch_lightning/plugins/training_type/ddp_spawn.py index 6f251eb36985a..6b6d85ee0d29f 100644 --- a/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ b/pytorch_lightning/plugins/training_type/ddp_spawn.py @@ -19,12 +19,15 @@ import torch.distributed as torch_distrib import torch.multiprocessing as mp from torch.nn.parallel.distributed import DistributedDataParallel +from torch.optim import Optimizer from pytorch_lightning import _logger as log from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.distributed import ( @@ -58,6 +61,15 @@ def __init__( self.node_rank = 0 self.mp_queue = None + def __getstate__(self): + """ Makes this plugin pickleable without destroying the queue in the current process. 
""" + state = self.__dict__.copy() + state["mp_queue"] = None + return state + + def __setstate__(self, state): + self.__dict__ = state + @property def root_device(self): return self.parallel_devices[self.local_rank] @@ -79,18 +91,28 @@ def setup(self, model): def set_world_ranks(self, process_idx): self.local_rank = process_idx self.node_rank = self.cluster_environment.node_rank() + self.task_idx = self.cluster_local_rank self.global_rank = self.node_rank * self.num_processes + self.local_rank self.world_size = self.num_nodes * self.num_processes + @property + def mp_spawn_kwargs(self): + return { + "args": (self.lightning_module.trainer, self.mp_queue), + "nprocs": self.num_processes, + } + def start_training(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer, )) + mp.spawn(self.new_process, **self.mp_spawn_kwargs) # reset optimizers, since main process is never used for training and thus does not have a valid optim state trainer.optimizers = [] def start_testing(self, trainer): - mp.spawn(self.new_process, nprocs=self.num_processes, args=(trainer, )) + mp.spawn(self.new_process, **self.mp_spawn_kwargs) + + def new_process(self, process_idx, trainer, mp_queue): + self.mp_queue = mp_queue - def new_process(self, process_idx, trainer): # TODO: check if needed seed = os.environ.get("PL_GLOBAL_SEED") if seed is not None: @@ -148,7 +170,19 @@ def post_training(self): # recover the weights of the processes trained in the children self.__recover_child_process_weights(best_path, last_path) + def pre_configure_ddp(self): + # todo: PyTorch 1.7.0 DDP introduces ``self.reducer._rebuild_buckets()`` breaking manual_optimization + if _TORCH_GREATER_EQUAL_1_7 and not self.lightning_module.automatic_optimization and not self._ddp_kwargs.get( + "find_unused_parameters", False + ): + rank_zero_warn( + "From PyTorch 1.7.0, Lightning ``manual_optimization`` needs to set ``find_unused_parameters=True`` " + "to properly work with DDP." + ) + self._ddp_kwargs["find_unused_parameters"] = True + def configure_ddp(self): + self.pre_configure_ddp() self._model = DistributedDataParallel( LightningDistributedModule(self.model), device_ids=self.determine_ddp_device_ids(), @@ -171,9 +205,13 @@ def determine_ddp_device_ids(self): return None return [self.root_device.index] + def on_save(self, checkpoint: dict) -> dict: + return checkpoint + def transfer_distrib_spawn_state_on_fit_end(self, results): # TODO: is there a better way than accessing callback through model -> trainer -> callback? - best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path + checkpoint_callback = self.lightning_module.trainer.checkpoint_callback + best_model_path = checkpoint_callback.best_model_path if checkpoint_callback else None if self.global_rank == 0 and self.mp_queue is not None: rank_zero_warn("cleaning up ddp environment...") @@ -183,7 +221,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, results): # TODO: is there a better way than accessing trainer through model -> trainer? 
if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path) - atomic_save(self.lightning_module.state_dict(), last_path) + atomic_save(self.on_save(self.lightning_module.state_dict()), last_path) # todo, pass complete checkpoint as state dictionary self.mp_queue.put(best_model_path) @@ -214,6 +252,11 @@ def model_to_device(self): torch.cuda.set_device(self.root_device) self.model.to(self.root_device) + def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): + """Run before precision plugin executes backward""" + if not self.lightning_module.automatic_optimization and self.model.require_backward_grad_sync: + prepare_for_backward(self.model, closure_loss) + def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if isinstance(output, torch.Tensor): output = sync_ddp_if_available(output, group, reduce_op) @@ -227,3 +270,10 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) + + def predict(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def post_training_step(self): + if not self.lightning_module.automatic_optimization: + self.model.require_backward_grad_sync = True diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 2bf4bbc0b4a96..d1a3e26e22693 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -27,6 +27,8 @@ def __init__(self, parallel_devices: List[torch.device]): super().__init__(parallel_devices=parallel_devices, cluster_environment=None) def setup(self, model): + # model needs to be moved to the device before it is wrapped + model.to(self.root_device) self._model = DataParallel(LightningParallelModule(model), self.parallel_devices) def reduce(self, output, *args, **kwargs): @@ -63,3 +65,15 @@ def validation_step(self, *args, **kwargs): def test_step(self, *args, **kwargs): return self.model(*args, **kwargs) + + def predict(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def training_step_end(self, output): + return self.reduce(output) + + def validation_step_end(self, output): + return self.reduce(output) + + def test_step_end(self, output): + return self.reduce(output) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index f45c3dcb93bb6..2393c040bcc8f 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -15,7 +15,7 @@ from typing import Any, List, Optional, Union import torch -from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.lr_scheduler import _LRScheduler, Optimizer from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin @@ -45,6 +45,7 @@ def setup(self, model): self.global_rank = hvd.rank() self.local_rank = hvd.local_rank() + self.world_size = hvd.size() rank_zero_only.rank = self.global_rank self.model_to_device() @@ -115,6 +116,9 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = hvd.broadcast_object(obj, src) return obj + def post_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): + optimizer.synchronize() + def model_to_device(self): if 
self.on_gpu: torch.cuda.set_device(self.root_device) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 91d44fbdaa5d1..a67dee93a6500 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -11,18 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import io from abc import ABC, abstractmethod from contextlib import contextmanager from typing import List, Optional import torch +from torch.nn.parallel import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin -from pytorch_lightning.utilities.distributed import ReduceOp +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, ReduceOp class ParallelPlugin(TrainingTypePlugin, ABC): @@ -34,10 +36,17 @@ def __init__( ): super().__init__() self.parallel_devices = parallel_devices - self.local_rank = 0 self.world_size = 1 + self.local_rank = 0 self.cluster_environment = cluster_environment + @property + def cluster_local_rank(self): + try: + return self.cluster_environment.local_rank() + except KeyError: + return 0 + @property @abstractmethod def root_device(self): @@ -98,7 +107,18 @@ def block_backward_sync(self): This is useful for skipping sync when accumulating gradients, reducing communication overhead Returns: context manager with sync behaviour off """ - if isinstance(self.model, LightningDistributedDataParallel): - yield self.model.no_sync() + if isinstance(self.model, DistributedDataParallel): + with self.model.no_sync(): + yield None else: yield None + + def broadcast(self, obj: object, src: int) -> object: + buffer = io.BytesIO() + torch.save(obj, buffer) + data = bytearray(buffer.getbuffer()) + data_tensor = torch.tensor(data).to(self.root_device, dtype=torch.float) + data = all_gather_ddp_if_available(data_tensor) + buffer = io.BytesIO(data.cpu().byte().numpy()) + obj = torch.load(buffer) + return obj diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 4aff83189b6bc..be81cd2a03c56 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -13,7 +13,7 @@ # limitations under the License. import os from contextlib import suppress -from typing import Optional +from typing import Optional, Sequence import torch @@ -25,6 +25,7 @@ DEFAULT_RPC_TIMEOUT_SEC = 60. 
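The new `ParallelPlugin.broadcast` above works by serializing an arbitrary picklable object into a byte tensor, gathering that tensor across ranks, and decoding it back into an object. A minimal single-process sketch of the serialize/deserialize round-trip (the helper names here are illustrative, not part of this patch):

import io

import torch


def object_to_tensor(obj) -> torch.Tensor:
    # torch.save accepts any picklable object; dump it into an in-memory
    # buffer and expose the raw bytes as a uint8 tensor
    buffer = io.BytesIO()
    torch.save(obj, buffer)
    return torch.tensor(bytearray(buffer.getbuffer()), dtype=torch.uint8)


def tensor_to_object(data: torch.Tensor):
    # reverse the round-trip: tensor -> bytes -> buffer -> object
    buffer = io.BytesIO(data.cpu().numpy().tobytes())
    return torch.load(buffer)


obj = {"epoch": 3, "monitor": "val_loss"}
assert tensor_to_object(object_to_tensor(obj)) == obj

In the plugin, the byte tensor is additionally moved to `self.root_device` and exchanged through `all_gather_ddp_if_available` before being decoded on each rank.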
if _RPC_AVAILABLE: from torch.distributed import rpc + with suppress(ModuleNotFoundError, ImportError): from torch.distributed.rpc.constants import DEFAULT_RPC_TIMEOUT_SEC @@ -40,11 +41,11 @@ class RPCPlugin(DDPPlugin): def __init__( self, - parallel_devices, - num_nodes=1, - cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, rpc_timeout_sec: float = DEFAULT_RPC_TIMEOUT_SEC, + parallel_devices: Sequence[int] = (), + num_nodes: Optional[int] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + sync_batchnorm: Optional[bool] = None, **kwargs ): self.rpc_timeout_sec = rpc_timeout_sec @@ -76,60 +77,11 @@ def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> No """ raise NotImplementedError - def on_main_rpc_connection(self, trainer) -> None: - """ - Called when main rpc connection has been established. - - Args: - trainer: The trainer object. - """ - raise NotImplementedError - - def on_accelerator_exit_rpc_process(self) -> None: - """ - Called to exit RPC process within the accelerator, that is being managed by main process. - - Args: - trainer: The trainer object. - """ - self.exit_rpc_process() - def exit_rpc_process(self): if self._is_rpc_initialized: torch.distributed.rpc.shutdown() self._is_rpc_initialized = False @property - def return_after_exit_rpc_process(self) -> bool: - """ - Override to decide whether to skip train/test function after shutdown completed. - Usually RPC shutdown is a join/exit function, afterwards we want to exit the process. - - Returns: - Whether to return after RPC exit. - """ - raise NotImplementedError - - def worker_optimizer_step(self, model: LightningModule, opt_idx: int, *args, **kwargs) -> None: - """ - Called when optimizer step is run on the main process. Used to signal any RPC workers to run optimizer step. - - Args: - model: The LightningModule. - opt_idx: The idx of the optimizer to carry out step on. - """ - raise NotImplementedError - - @property - def is_main_rpc_process(self) -> bool: - """ - Override to add logic to determine current process is main RPC process. - """ - raise NotImplementedError - - def barrier(self, name: Optional[str] = None) -> None: - """ - Override to define distributed sync communication. This needs to be handled differently due to - the RPC connection managing certain processes at the same time. 
- """ - raise NotImplementedError + def rpc_enabled(self) -> bool: + return True diff --git a/pytorch_lightning/plugins/training_type/rpc_sequential.py b/pytorch_lightning/plugins/training_type/rpc_sequential.py index baff4289c75a1..331cbe76639f3 100644 --- a/pytorch_lightning/plugins/training_type/rpc_sequential.py +++ b/pytorch_lightning/plugins/training_type/rpc_sequential.py @@ -13,16 +13,16 @@ # limitations under the License import logging import os -from typing import Any, List, Optional +from typing import List, Optional import torch import torch.distributed as torch_distrib from torch import nn from torch.nn.parallel import DistributedDataParallel +from torch.optim import Optimizer -from pytorch_lightning import LightningModule -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel -from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment +from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.overrides.distributed import LightningDistributedModule from pytorch_lightning.plugins.training_type.rpc import DEFAULT_RPC_TIMEOUT_SEC, RPCPlugin from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE, rank_zero_only @@ -42,11 +42,7 @@ class RPCSequentialPlugin(RPCPlugin): def __init__( self, - parallel_devices, - num_nodes: int = 1, - cluster_environment: ClusterEnvironment = None, - sync_batchnorm=False, - balance: Optional[List[int]] = None, + balance: List[int], microbatches: int = 8, checkpoint: str = 'except_last', balance_mode: str = "balance_by_size", @@ -92,14 +88,7 @@ def __init__( `get_model_parallel_world_size() > 1` """ self._check_pipe_available() - super().__init__( - parallel_devices=parallel_devices, - num_nodes=num_nodes, - cluster_environment=cluster_environment, - sync_batchnorm=sync_batchnorm, - rpc_timeout_sec=rpc_timeout_sec, - **kwargs - ) + super().__init__(rpc_timeout_sec=rpc_timeout_sec, **kwargs) self.balance = balance @@ -107,15 +96,18 @@ def __init__( self.checkpoint = checkpoint self.balance_mode = balance_mode self.pipelined_backward = pipelined_backward - self.main_rpc_process = False # Updated by main process, default for all secondary processes + self._main_rpc_process = True def init_ddp_connection( self, global_rank: int, world_size: int, ) -> None: - # what is this used for? 
- self.prepared_for_backwards = False + if self.lightning_module.trainer.amp_backend is not None: + raise MisconfigurationException( + '`RPCSequentialPlugin` is currently not supported in Automatic Mixed Precision' + ) + if self._skip_init_connections(): return super().init_ddp_connection( @@ -129,21 +121,18 @@ def init_ddp_connection( self.set_main_rpc_process() self._check_sequential_model_exists(model) + + # check if user given balance is valid + if self.balance is not None: + self._assert_valid_model_balance() + if self.main_rpc_process: if self.balance is None: self._infer_model_balance() - self._assert_valid_model_balance() - - if not self.is_main_rpc_process: - self.on_accelerator_exit_rpc_process() - self.exit_rpc_process() - if self.return_after_exit_rpc_process: - return + self.init_pipe_module() else: - self.on_main_rpc_connection() - - def on_before_manual_backward(self, model: LightningDistributedDataParallel, output: Any): - pass + self.handle_transferred_pipe_module() + self.exit_rpc_process() def _infer_model_balance(self): log.info(f'Inferring model balance using {self.balance_mode} mode') @@ -197,6 +186,8 @@ def _find_and_init_pipe_module(self, model): model.sequential_module.module.model.trainer = model.trainer model.sequential_module.module.model.configure_optimizers = model.configure_optimizers + self.model = model + else: raise MisconfigurationException( 'Could not find a PipeLightningModule within the model. ' @@ -239,21 +230,16 @@ def _infer_check_num_gpus(self): # Assume that the user wants to balance his model on all GPUs return self.world_size - def on_accelerator_exit_rpc_process(self) -> None: + def handle_transferred_pipe_module(self) -> None: if not self.lightning_module.running_stage == RunningStage.TESTING: torch_distrib.barrier() # Ensure we await main process initialization - # Add trainer/configure_optimizers to the pipe model for access in all worker processes rpc_pipe.PipeModel.trainer = self.lightning_module.trainer del rpc_pipe.PipeModel.trainer.model.sequential_module rpc_pipe.PipeModel.trainer.model.sequential_module = rpc_pipe.PipeModel rpc_pipe.PipeModel.configure_optimizers = self.lightning_module.configure_optimizers - super().on_accelerator_exit_rpc_process() - def set_main_rpc_process(self): - self.main_rpc_process = torch_distrib.get_rank(group=mpu.get_pipeline_parallel_group()) == 0 - - def on_main_rpc_connection(self) -> None: + def init_pipe_module(self) -> None: # Create pipe_module model = self.lightning_module self._find_and_init_pipe_module(model) @@ -261,18 +247,23 @@ def on_main_rpc_connection(self) -> None: torch_distrib.barrier() # Ensure we join main process initialization model.sequential_module.foreach_worker(register_optimizers, include_self=True) - # TODO: Move this to the connector - def _check_arguments(self, trainer): - if trainer.amp_backend is not None: - raise MisconfigurationException( - 'DDPSequentialPlugin is currently not supported in Automatic Mixed Precision' - ) + # TODO: Move this to the connector + + def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): + """Run before precision plugin executes backward""" - def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> DistributedDataParallel: - ddp_plugin = RPCPlugin(process_group=mpu.get_data_parallel_group()).configure_ddp(model, device_ids) - # Plugin handle backwards across processes. 
Currently not supported for DDP + pipe parallel - ddp_plugin.PREPARE_FOR_BACKWARDS = False - return ddp_plugin + def configure_ddp(self): + if self.main_rpc_process: + self.pre_configure_ddp() + + self._model = DistributedDataParallel( + LightningDistributedModule(self.model), + device_ids=self.determine_ddp_device_ids(), + process_group=mpu.get_data_parallel_group(), + **self._ddp_kwargs, + ) + # Plugin handles backward across processes. Currently not supported for DDP + pipe parallel + self._model.require_backward_grad_sync = False @rank_zero_only def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None: @@ -296,7 +287,8 @@ def worker_optimizer_step(self, model: LightningModule, opt_idx: int, *args, **k }, include_self=False ) - def distributed_sampler_kwargs(self, distributed_sampler_kwargs): + @property + def distributed_sampler_kwargs(self): return dict( num_replicas=mpu.get_data_parallel_world_size(), rank=mpu.get_data_parallel_rank(), @@ -306,16 +298,19 @@ def data_parallel_group(self): return mpu.get_data_parallel_group() - @property - def is_main_rpc_process(self) -> bool: - return self.main_rpc_process + def set_main_rpc_process(self): + self.main_rpc_process = torch_distrib.get_rank(group=mpu.get_pipeline_parallel_group()) == 0 @property - def return_after_exit_rpc_process(self) -> bool: - return True + def main_rpc_process(self) -> bool: + return self._main_rpc_process + + @main_rpc_process.setter + def main_rpc_process(self, is_main_process): + self._main_rpc_process = is_main_process def barrier(self, name: Optional[str] = None) -> None: - if torch_distrib.is_initialized() and self.is_main_rpc_process: + if torch_distrib.is_initialized() and self.main_rpc_process: torch_distrib.barrier(group=self.data_parallel_group) def _check_pipe_available(self): @@ -324,6 +319,24 @@ def _check_pipe_available(self): 'PipeRPCPlugin requires FairScale and currently is only supported on PyTorch 1.6.'
) + def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, **kwargs) -> None: + """Hook to do something after each optimizer step.""" + if self.rpc_enabled and self.main_rpc_process: + # Initialize optimizer step on main process + self.worker_optimizer_step(model=self.lightning_module, opt_idx=optimizer_idx, **kwargs) + + def post_training(self): + if self.main_rpc_process: + super().post_training() + + def start_training(self, trainer: 'Trainer') -> None: + if self.main_rpc_process: + super().start_training(trainer) + + def start_testing(self, trainer: 'Trainer') -> None: + if self.main_rpc_process: + super().start_testing(trainer) + class LightningPipeModule(nn.Module): """ diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 1ad436c7cdbb4..ad0ab693bee0d 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -1,21 +1,23 @@ from typing import Optional +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.plugins.training_type.ddp import DDPPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only if _FAIRSCALE_AVAILABLE: + from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel from fairscale.optim import OSS - from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel + from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded class DDPShardedPlugin(DDPPlugin): def configure_ddp(self): self._wrap_optimizers() - self._model = LightningShardedDataParallel( - self.model, sharded_optimizer=self.lightning_module.trainer.optimizers + self._model = ShardedDataParallel( + LightningShardedDataParallel(self.model), sharded_optimizer=self.lightning_module.trainer.optimizers ) def _reinit_optimizers_with_oss(self): @@ -29,7 +31,8 @@ def _reinit_optimizers_with_oss(self): optimizers[x] = zero_optimizer del optimizer trainer = self.lightning_module.trainer - trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + trainer.optimizers = optimizers + trainer.convert_to_lightning_optimizers() def _wrap_optimizers(self): trainer = self.model.trainer @@ -50,3 +53,7 @@ def _optim_state_dict(self, optimizer): :meth:`consolidate_state_dict`. 
""" return optimizer.state_dict() + + @property + def lightning_module(self) -> LightningModule: + return unwrap_lightning_module_sharded(self._model) diff --git a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index f71b28ebefb77..c38690473b77d 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -1,35 +1,35 @@ from typing import Optional +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.optimizer import is_lightning_optimizer from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only if _FAIRSCALE_AVAILABLE: + from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel from fairscale.optim import OSS - from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel + from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded class DDPSpawnShardedPlugin(DDPSpawnPlugin): def configure_ddp(self): self._wrap_optimizers() - self._model = LightningShardedDataParallel( - self.model, sharded_optimizer=self.lightning_module.trainer.optimizers + self._model = ShardedDataParallel( + LightningShardedDataParallel(self.model), sharded_optimizer=self.lightning_module.trainer.optimizers ) def _reinit_optimizers_with_oss(self): optimizers = self.lightning_module.trainer.optimizers for x, optimizer in enumerate(optimizers): - if is_lightning_optimizer(optimizer): - optimizer = optimizer._optimizer if not isinstance(optimizer, OSS): optim_class = type(optimizer) zero_optimizer = OSS(params=optimizer.param_groups, optim=optim_class, **optimizer.defaults) optimizers[x] = zero_optimizer del optimizer trainer = self.lightning_module.trainer - trainer.optimizers = trainer.convert_to_lightning_optimizers(optimizers) + trainer.optimizers = optimizers def _wrap_optimizers(self): trainer = self.model.trainer @@ -38,9 +38,6 @@ def _wrap_optimizers(self): self._reinit_optimizers_with_oss() def optimizer_state(self, optimizer: 'OSS') -> Optional[dict]: - if is_lightning_optimizer(optimizer): - optimizer = optimizer._optimizer - if isinstance(optimizer, OSS): optimizer.consolidate_state_dict() return self._optim_state_dict(optimizer) @@ -52,3 +49,7 @@ def _optim_state_dict(self, optimizer): :meth:`consolidate_state_dict`. 
""" return optimizer.state_dict() + + @property + def lightning_module(self) -> LightningModule: + return unwrap_lightning_module_sharded(self._model) diff --git a/pytorch_lightning/plugins/training_type/single_tpu.py b/pytorch_lightning/plugins/training_type/single_tpu.py index cf0307a29e73a..46df404bdc02f 100644 --- a/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/pytorch_lightning/plugins/training_type/single_tpu.py @@ -1,12 +1,14 @@ import io import os -from typing import Optional +from typing import Optional, Union import torch +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities.apply_func import move_data_to_device if _TPU_AVAILABLE: import torch_xla @@ -15,7 +17,9 @@ class SingleTPUPlugin(SingleDevicePlugin): - def __init__(self, device: torch.device): + def __init__(self, device: Union[torch.device, int]): + if isinstance(device, int): + device = xm.xla_device(device) super().__init__(device) self.tpu_local_core_rank = 0 @@ -24,6 +28,14 @@ def __init__(self, device: torch.device): def on_tpu(self) -> bool: return True + def connect(self, model: torch.nn.Module) -> torch.nn.Module: + self._model = model + self.model_to_device() + return self._model + + def model_to_device(self) -> None: + self._model.to(self.root_device) + def pre_training(self) -> None: if isinstance(self.device, int): self.device = xm.xla_device(self.device) @@ -37,3 +49,23 @@ def post_training(self) -> None: if on_colab_kaggle(): rank_zero_warn("cleaning up... please do not interrupt") self.save_spawn_weights(model) + + def save_spawn_weights(self, model: LightningModule) -> Optional[str]: + """ + Dump a temporary checkpoint after ddp ends to get weights out of the process + """ + path = os.path.join(model.trainer.default_root_dir, "__temp_weight_distributed_end.ckpt") + model.trainer.save_checkpoint(path) + return path + + def on_save(self, checkpoint: dict) -> dict: + """ + Move XLA tensors to CPU before saving + Recommended on XLA Guide: + https://github.com/pytorch/xla/blob/master/API_GUIDE.md#saving-and-loading-xla-tensors + """ + return move_data_to_device(checkpoint, torch.device("cpu")) + + @property + def is_distributed(self): + return False \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 0f516e2b0b046..4c5844da94ced 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -1,14 +1,15 @@ import io import os -from typing import Any, Dict, Iterable, Optional, Sequence, Union +import re +from typing import Any, Dict, Iterable, Optional, Sequence, Tuple, Union import torch +import torch.multiprocessing as mp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn -from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.distributed import rank_zero_only from pytorch_lightning.utilities.seed import seed_everything @@ -31,10 +32,28 @@ def __init__(self, parallel_devices: 
Sequence[int], num_nodes: int = 1, **kwargs self.tpu_local_core_rank = 0 self.start_method = None + def connect(self, model: torch.nn.Module) -> torch.nn.Module: + self.create_mp_queue() + self._model = model + return self._model + + def create_mp_queue(self): + self.start_method = 'fork' + smp = mp.get_context(self.start_method) + self.mp_queue = smp.SimpleQueue() + @property def distributed_sampler_kwargs(self) -> dict: return dict(num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) + @property + def should_finalize(self): + return self.world_size == 1 + + @property + def is_distributed(self): + return self.world_size != 1 + def process_dataloader(self, dataloader: Union[Iterable, torch.utils.data.DataLoader]) -> ParallelLoader: device = xm.xla_device() dataloader = xla_pl.ParallelLoader(dataloader, [device]) @@ -53,7 +72,9 @@ def set_world_ranks(self, process_idx: int) -> None: self.global_rank = self.tpu_local_core_rank self.world_size = self.num_nodes * self.num_processes - def new_process(self, process_idx: int, trainer) -> None: + def new_process(self, process_idx: int, trainer, mp_queue) -> None: + self.mp_queue = mp_queue + seed = os.environ.get("PL_GLOBAL_SEED") if seed is not None: seed_everything(int(seed)) @@ -67,6 +88,11 @@ def new_process(self, process_idx: int, trainer) -> None: trainer.progress_bar_callback.disable() self.model_to_device() + trainer.accelerator_backend.setup_optimizers(trainer) + trainer.precision_plugin.connect(self._model, None, None) + + # replace trainer save_checkpoint to use `xm.save` + trainer.save_checkpoint = self.save_checkpoint self.barrier() if trainer.testing: @@ -77,25 +103,37 @@ def new_process(self, process_idx: int, trainer) -> None: self.__save_end_of_training_weights(self.lightning_module) self.transfer_distrib_spawn_state_on_fit_end(results) - def __save_end_of_training_weights(self, model: LightningModule, trainer) -> None: + def __save_end_of_training_weights(self, model: LightningModule) -> None: # when training ends on these platforms dump weights to get out of the main process if on_colab_kaggle(): rank_zero_warn("cleaning up... please do not interrupt") self.save_spawn_weights(model) def model_to_device(self) -> None: - pass + self._model.to(xm.xla_device()) def barrier(self, name: Optional[str] = None) -> None: rendezvous(f"pl.Trainer.{name}") - def on_save(self, checkpoint: dict) -> dict: - """ - Move XLA tensors to CPU before saving - Recommended on XLA Guide: - https://github.com/pytorch/xla/blob/master/API_GUIDE.md#saving-and-loading-xla-tensors - """ - return move_data_to_device(checkpoint, torch.device("cpu")) + def transfer_distrib_spawn_state_on_fit_end(self, results): + # TODO: is there a better way than accessing callback through model -> trainer -> callback? + best_model_path = self.lightning_module.trainer.checkpoint_callback.best_model_path + + if self.mp_queue is not None: + rank_zero_warn("cleaning up ddp environment...") + + # save the last weights + last_path = None + # TODO: is there a better way than accessing trainer through model -> trainer? 
+ if not self.lightning_module.trainer.testing and best_model_path is not None and len(best_model_path) > 0: + last_path = re.sub(".ckpt", ".tmp_end.ckpt", best_model_path) + xm.save(self.lightning_module.state_dict(), last_path) + + if self.global_rank == 0: + # todo, pass complete checkpoint as state dictionary + self.mp_queue.put(best_model_path) + self.mp_queue.put(last_path) + self.mp_queue.put(results) def broadcast(self, obj: object, src: int = 0) -> object: buffer = io.BytesIO() @@ -150,8 +188,8 @@ def post_training(self) -> None: # restore main state with best weights best_path = self.mp_queue.get() - results = self.mp_queue.get() last_path = self.mp_queue.get() + results = self.mp_queue.get() # transfer back the best path to the trainer if self.lightning_module.trainer.checkpoint_callback is not None: @@ -163,7 +201,7 @@ def post_training(self) -> None: ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt) - self.lightning_module = model + self._model = model # when training completes, load the weights back in main process self.__load_weights_on_main_process() @@ -173,21 +211,48 @@ def __load_weights_on_main_process(self) -> None: # load weights if not interrupted # TODO: check for trainer reference - if self.on_colab_kaggle and not model.trainer.testing: + if on_colab_kaggle() and not model.trainer.testing: self.load_spawn_weights(model) - self.lightning_module = model + self._model = model @property def xmp_spawn_kwargs(self): return { - "args": (self.lightning_module, trainer, self.mp_queue), - "nproc": len(self.parallel_devices), + "args": (self.lightning_module.trainer, self.mp_queue), + "nprocs": len(self.parallel_devices), "start_method": self.start_method } def start_training(self, trainer) -> None: + # todo: the precision plugin is called in accelerator setup and should be moved + if 'XLA_USE_BF16' in os.environ: + del os.environ["XLA_USE_BF16"] xmp.spawn(self.new_process, **self.xmp_spawn_kwargs) def start_testing(self, trainer) -> None: xmp.spawn(self.new_process, **self.xmp_spawn_kwargs) + + def training_step(self, *args, **kwargs): + return self.lightning_module.training_step(*args, **kwargs) + + def validation_step(self, *args, **kwargs): + return self.lightning_module.validation_step(*args, **kwargs) + + def test_step(self, *args, **kwargs): + return self.lightning_module.test_step(*args, **kwargs) + + def predict(self, *args, **kwargs): + return self.lightning_module.predict(*args, **kwargs) + + def save_checkpoint(self, filepath, weights_only: bool = False): + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + filepath: write-target file's path + weights_only: saving model weights only + """ + # dump states as a checkpoint dictionary object + _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) + # Todo: TypeError: 'mappingproxy' object does not support item assignment + xm.save({k: v for k, v in _checkpoint.items() if k != "callbacks"}, filepath) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 89f2329512e5e..db0e390c4b03e 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -13,12 +13,14 @@ # limitations under the License.
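The reordered `mp_queue.get()` calls in `post_training` above are not cosmetic: the spawn queue is FIFO, so the parent process must read entries back in exactly the order the child put them (`best_path`, then `last_path`, then `results`). A toy illustration of that contract, using plain `multiprocessing` rather than torch_xla (the names are illustrative):

import multiprocessing as mp


def child(queue):
    # the child publishes its state in a fixed order ...
    queue.put("best.ckpt")    # best_path
    queue.put("last.ckpt")    # last_path
    queue.put({"loss": 0.1})  # results


if __name__ == "__main__":
    queue = mp.SimpleQueue()
    process = mp.Process(target=child, args=(queue,))
    process.start()
    process.join()
    # ... and the parent must consume it in the same order,
    # otherwise `results` would be mistaken for `last_path`
    best_path = queue.get()
    last_path = queue.get()
    results = queue.get()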
import os from abc import ABC, abstractmethod -from typing import Any, Optional, Sequence, TYPE_CHECKING, Union +from typing import Any, Optional, TYPE_CHECKING, Union import torch +from torch.nn import Module +from torch.optim import Optimizer -from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.base_plugin import Plugin if TYPE_CHECKING: @@ -33,6 +35,10 @@ def __init__(self) -> None: self._results = None self.global_rank = 0 + @property + def should_finalize(self): + return True + @property @abstractmethod def on_gpu(self) -> bool: @@ -64,35 +70,32 @@ def barrier(self, name: Optional[str] = None) -> None: def broadcast(self, obj: object, src: int = 0) -> object: """Broadcasts an object to all processes""" - # TODO method this is currently unused. Check after complete refactors are pushed - def set_nvidia_flags(self, is_slurm_managing_tasks: bool, device_ids: Optional[Sequence]) -> None: - if device_ids is None: - return - - # set the correct cuda visible devices (using pci order) - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())]) - devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids) - if self.lightning_module is not None: - log.info(f"LOCAL_RANK: {self.lightning_module.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]") - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: """Reduce the early stopping decision across all possibly spawned processes""" return should_stop + def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): + """Run before precision plugin executes backward""" + + def post_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): + """Run after precision plugin executes backward""" + + def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, **kwargs) -> None: + """Hook to do something after each optimizer step.""" + @property - def model(self) -> torch.nn.Module: + def model(self) -> Module: """Returns the potentially wrapped LightningModule""" return self._model @model.setter - def model(self, new_model: torch.nn.Module) -> None: + def model(self, new_model: Module) -> None: self._model = new_model @property def lightning_module(self) -> Optional[LightningModule]: """Returns the pure LightningModule without potential wrappers""" - return self._model + return unwrap_lightning_module(self._model) @property def results(self) -> Any: @@ -118,8 +121,26 @@ def start_testing(self, trainer: 'Trainer') -> None: def training_step(self, *args, **kwargs): return self.lightning_module.training_step(*args, **kwargs) + def post_training_step(self): + pass + def validation_step(self, *args, **kwargs): return self.lightning_module.validation_step(*args, **kwargs) def test_step(self, *args, **kwargs): return self.lightning_module.test_step(*args, **kwargs) + + def predict(self, *args, **kwargs): + return self.lightning_module.predict(*args, **kwargs) + + def training_step_end(self, output): + return output + + def validation_step_end(self, output): + return output + + def test_step_end(self, output): + return output + + def on_save(self, checkpoint: dict) -> dict: + return checkpoint diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 
cc3655a549910..a11394734f97b 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -209,11 +209,15 @@ def on_save_checkpoint(self): def on_load_checkpoint(self, checkpoint): """Called when loading a model checkpoint.""" callback_states = checkpoint.get('callbacks') - for callback in self.callbacks: - state = callback_states.get(type(callback)) - if state: - state = deepcopy(state) - callback.on_load_checkpoint(state) + # Todo: the `callback_states` are dropped with TPUSpawn as they + # can't be saved using `xm.save` + # https://github.com/pytorch/xla/issues/2773 + if callback_states is not None: + for callback in self.callbacks: + state = callback_states.get(type(callback)) + if state: + state = deepcopy(state) + callback.on_load_checkpoint(state) def on_after_backward(self): """ diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index e3f50a691ca5a..2fca7b410f3e1 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -73,7 +73,7 @@ def restore_weights(self) -> None: self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer._device_type == DeviceType.GPU) # wait for all to catch up - self.trainer.accelerator_backend.barrier('TrainerIOMixin.restore_weights') + self.trainer.training_type_plugin.barrier('TrainerIOMixin.restore_weights') # clear cache after restore if self.trainer._device_type == DeviceType.GPU: @@ -400,11 +400,11 @@ def save_checkpoint(self, filepath, weights_only: bool = False): """ # dump states as a checkpoint dictionary object checkpoint = self.dump_checkpoint(weights_only) - if self.trainer.is_global_zero: # write the checkpoint dictionary on the file - if self.trainer.accelerator_backend: - checkpoint = self.trainer.accelerator_backend.on_save(checkpoint) + + if self.trainer.training_type_plugin: + checkpoint = self.trainer.training_type_plugin.on_save(checkpoint) try: atomic_save(checkpoint, filepath) except AttributeError as err: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 439e9046726ce..595a5e84bf630 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -32,8 +32,9 @@ class LoggerConnector: - def __init__(self, trainer): + def __init__(self, trainer, log_gpu_memory: bool): self.trainer = trainer + self.log_gpu_memory = log_gpu_memory self._callback_metrics = MetricsHolder() self._evaluation_callback_metrics = MetricsHolder(to_float=True) self._logged_metrics = MetricsHolder() @@ -218,8 +219,8 @@ def log_metrics(self, metrics, grad_norm_dic, step=None): and global_step for the rest. 
""" # add gpu memory - if self.trainer._device_type == DeviceType.GPU and self.trainer.log_gpu_memory: - mem_map = memory.get_memory_profile(self.trainer.log_gpu_memory) + if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory: + mem_map = memory.get_memory_profile(self.log_gpu_memory) metrics.update(mem_map) # add norms diff --git a/pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py b/pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py index 394e4285d3a9b..82f328a927485 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/metrics_holder.py @@ -17,7 +17,6 @@ import torch from pytorch_lightning.metrics.metric import Metric -from pytorch_lightning.utilities import _TPU_AVAILABLE class MetricsHolder: @@ -73,7 +72,7 @@ def _convert_to_tensor(self, current: Any, use_tpu: bool, device: torch.device): else: current = torch.tensor(current, device=device, dtype=torch.float) - if use_tpu and _TPU_AVAILABLE: + if isinstance(current, torch.Tensor) and current.device.type == "xla": current = current.cpu() return current diff --git a/pytorch_lightning/trainer/connectors/model_connector.py b/pytorch_lightning/trainer/connectors/model_connector.py index 6a303b9822085..060601049f9b7 100644 --- a/pytorch_lightning/trainer/connectors/model_connector.py +++ b/pytorch_lightning/trainer/connectors/model_connector.py @@ -36,14 +36,12 @@ def copy_trainer_model_properties(self, model): m._distrib_type = str(self.trainer._distrib_type) m.use_amp = self.trainer.amp_backend is not None m.testing = self.trainer.testing - m.tpu_local_core_rank = self.trainer.tpu_local_core_rank - m.tpu_global_core_rank = self.trainer.tpu_global_core_rank m.precision = self.trainer.precision def get_model(self): return self._get_reference_model(self.trainer.model) def _get_reference_model(self, model): - if self.trainer.accelerator_backend: - return self.trainer.accelerator_backend.get_reference_model(model) + if self.trainer.accelerator_backend and self.trainer.accelerator_backend.lightning_module: + return self.trainer.accelerator_backend.lightning_module return model diff --git a/pytorch_lightning/trainer/connectors/slurm_connector.py b/pytorch_lightning/trainer/connectors/slurm_connector.py index 5932937f6cc85..02552dd67de26 100644 --- a/pytorch_lightning/trainer/connectors/slurm_connector.py +++ b/pytorch_lightning/trainer/connectors/slurm_connector.py @@ -1,14 +1,8 @@ import os -import re import signal from subprocess import call -import torch -import torch.distributed as torch_distrib - from pytorch_lightning import _logger as log -from pytorch_lightning.utilities import DeviceType, DistributedType -from pytorch_lightning.utilities.distributed import rank_zero_info class SLURMConnector: @@ -16,57 +10,6 @@ class SLURMConnector: def __init__(self, trainer): self.trainer = trainer - def on_trainer_init(self, num_gpu_nodes): - self.configure_slurm_ddp(num_gpu_nodes) - - def configure_slurm_ddp(self, num_gpu_nodes): - self.trainer.is_slurm_managing_tasks = False - - # extract SLURM flag vars - # whenever we have the correct number of tasks, we let slurm manage processes - # otherwise we launch the required number of processes - if self.trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): - self.trainer.num_requested_gpus = self.trainer.num_gpus * num_gpu_nodes - self.trainer.num_slurm_tasks = 0 - try: - self.trainer.num_slurm_tasks = 
int(os.environ['SLURM_NTASKS']) - self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_requested_gpus - - # enable slurm cpu - if self.trainer.num_requested_gpus == 0: - self.trainer.is_slurm_managing_tasks = self.trainer.num_slurm_tasks == self.trainer.num_processes - - # in interactive mode we don't manage tasks - job_name = os.environ['SLURM_JOB_NAME'] - if job_name == 'bash': - self.trainer.is_slurm_managing_tasks = False - # todo: specify the possible exception - except Exception: - # likely not on slurm, so set the slurm managed flag to false - self.trainer.is_slurm_managing_tasks = False - - # used for tests only, set this flag to simulate slurm managing a task - should_fake = os.environ.get('FAKE_SLURM_MANAGING_TASKS') - if should_fake and int(should_fake): - self.trainer.is_slurm_managing_tasks = True - - # notify user the that slurm is managing tasks - if self.trainer.is_slurm_managing_tasks: - rank_zero_info('Multi-processing is handled by Slurm.') - - # todo: the same function as slurm_environment.py `_resolve_root_node_address` - def resolve_root_node_address(self, root_node): - if '[' in root_node: - name, numbers = root_node.split('[', maxsplit=1) - number = numbers.split(',', maxsplit=1)[0] - if '-' in number: - number = number.split('-')[0] - - number = re.sub('[^0-9]', '', number) - root_node = name + number - - return root_node - def register_slurm_signal_handlers(self): # see if we're using slurm (not interactive) on_slurm = False @@ -112,44 +55,3 @@ def term_handler(self, signum, frame): # Todo: required argument `signum` is not used # Todo: required argument `frame` is not used log.info("bypassing sigterm") - - # todo: this is the same func as slurm_environment.py `master_port` - def connect_ddp(self, global_rank: int, world_size: int) -> None: - """ - Sets up environment variables necessary for pytorch distributed communications - based on slurm environment. 
- """ - # use slurm job id for the port number - # guarantees unique ports across jobs from same grid search - default_port = os.environ.get("SLURM_JOB_ID") - if default_port: - # use the last 4 numbers in the job id as the id - default_port = default_port[-4:] - # all ports should be in the 10k+ range - default_port = int(default_port) + 15000 - else: - default_port = 12910 - - # if user gave a port number, use that one instead - if "MASTER_PORT" in os.environ: - default_port = os.environ["MASTER_PORT"] - else: - os.environ["MASTER_PORT"] = str(default_port) - log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}") - - # figure out the root node addr - root_node = os.environ.get("SLURM_NODELIST") - if root_node: - root_node = root_node.split(" ")[0].split(",")[0] - else: - root_node = "127.0.0.1" - - root_node = self.trainer.slurm_connector.resolve_root_node_address(root_node) - os.environ["MASTER_ADDR"] = root_node - log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}") - - torch_backend = "nccl" if self.trainer._device_type == DeviceType.GPU else "gloo" - - if not torch.distributed.is_initialized(): - log.info(f"initializing ddp (SLURM): GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") - torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index f319dd6594140..b02f768361ec3 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -62,7 +62,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None: # ddp_spawn + num_workers > 0 don't mix! tell the user is_dataloader = isinstance(dataloader, DataLoader) - using_spawn = self.distributed_backend == "ddp_spawn" + using_spawn = self.accelerator_connector.distributed_backend == "ddp_spawn" if is_dataloader and not on_windows: if dataloader.num_workers > 0 and using_spawn: rank_zero_warn( @@ -97,8 +97,10 @@ def auto_add_sampler(self, dataloader: DataLoader, shuffle: bool) -> DataLoader: if not is_dataloader or is_iterable_ds: return dataloader - need_dist_sampler = self.require_distributed_sampler and not isinstance(dataloader.sampler, DistributedSampler) - if self.replace_sampler_ddp and need_dist_sampler: + need_dist_sampler = self.accelerator_connector.is_distributed and not isinstance( + dataloader.sampler, DistributedSampler + ) + if self.accelerator_connector.replace_sampler_ddp and need_dist_sampler: if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)): raise MisconfigurationException( 'You seem to have configured a sampler in your DataLoader. This will be replaced ' @@ -385,7 +387,7 @@ def request_dataloader(self, dataloader_fx: Callable) -> DataLoader: dataloader = self._flatten_dl_only(dataloader) if self.accelerator_backend is not None: - self.accelerator_backend.barrier('get_dataloaders') + self.training_type_plugin.barrier('get_dataloaders') return dataloader def _flatten_dl_only(self, dataloaders): diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index e0c79c20cfbbe..a6aeeb7d73f78 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import DeviceType, DistributedType, rank_zero_warn @@ -21,28 +22,29 @@ class DeprecatedDistDeviceAttributes: _device_type: DeviceType _running_stage: RunningStage num_gpus: int + accelerator_connector: BackendConnector @property def on_cpu(self) -> bool: rank_zero_warn("Internal: `on_cpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - return self._device_type == DeviceType.CPU + return self.accelerator_connector._device_type == DeviceType.CPU @on_cpu.setter def on_cpu(self, val: bool) -> None: rank_zero_warn("Internal: `on_cpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: - self._device_type = DeviceType.CPU + self.accelerator_connector._device_type = DeviceType.CPU @property def on_tpu(self) -> bool: rank_zero_warn("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - return self._device_type == DeviceType.TPU + return self.accelerator_connector._device_type == DeviceType.TPU @on_tpu.setter def on_tpu(self, val: bool) -> None: rank_zero_warn("Internal: `on_tpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: - self._device_type = DeviceType.TPU + self.accelerator_connector._device_type = DeviceType.TPU @property def use_tpu(self) -> bool: @@ -57,57 +59,57 @@ def use_tpu(self, val: bool) -> None: @property def on_gpu(self) -> bool: rank_zero_warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - return self._device_type == DeviceType.GPU + return self.accelerator_connector._device_type == DeviceType.GPU @on_gpu.setter def on_gpu(self, val: bool) -> None: rank_zero_warn("Internal: `on_gpu` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: - self._device_type = DeviceType.GPU + self.accelerator_connector._device_type = DeviceType.GPU @property def use_dp(self) -> bool: rank_zero_warn("Internal: `use_dp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - return self._distrib_type == DistributedType.DP + return self.accelerator_connector._distrib_type == DistributedType.DP @use_dp.setter def use_dp(self, val: bool) -> None: rank_zero_warn("Internal: `use_dp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: - self._distrib_type = DistributedType.DP + self.accelerator_connector._distrib_type = DistributedType.DP @property def use_ddp(self) -> bool: rank_zero_warn("Internal: `use_ddp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - return self._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) + return self.accelerator_connector._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) @use_ddp.setter def use_ddp(self, val: bool) -> None: rank_zero_warn("Internal: `use_ddp` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: - self._distrib_type = DistributedType.DDP + self.accelerator_connector._distrib_type = DistributedType.DDP @property def use_ddp2(self) -> bool: rank_zero_warn("Internal: `use_ddp2` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - return self._distrib_type == DistributedType.DDP2 + return self.accelerator_connector._distrib_type == DistributedType.DDP2 @use_ddp2.setter def use_ddp2(self, val: bool) -> None: rank_zero_warn("Internal: 
`use_ddp2` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: - self._distrib_type = DistributedType.DDP2 + self.accelerator_connector._distrib_type = DistributedType.DDP2 @property def use_horovod(self) -> bool: rank_zero_warn("Internal: `use_horovod` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) - return self._distrib_type == DistributedType.HOROVOD + return self.accelerator_connector._distrib_type == DistributedType.HOROVOD @use_horovod.setter def use_horovod(self, val: bool) -> None: rank_zero_warn("Internal: `use_horovod` is deprecated in v1.2 and will be removed in v1.4.", DeprecationWarning) if val: - self._distrib_type = DistributedType.HOROVOD + self.accelerator_connector._distrib_type = DistributedType.HOROVOD @property def use_single_gpu(self) -> bool: @@ -116,8 +118,8 @@ def use_single_gpu(self) -> bool: ) # todo, limiting to exclude DDP2 is not clear but it comes from connectors... return ( - self._device_type and self._device_type == DeviceType.GPU and self.num_gpus == 1 - and self._distrib_type != DistributedType.DDP2 + self.accelerator_connector._device_type and self.accelerator_connector._device_type == DeviceType.GPU + and self.num_gpus == 1 and self.accelerator_connector._distrib_type not in (DistributedType.DDP2, ) ) @use_single_gpu.setter @@ -127,4 +129,4 @@ def use_single_gpu(self, val: bool) -> None: DeprecationWarning, ) if val: - self._device_type = DeviceType.GPU + self.accelerator_connector._device_type = DeviceType.GPU diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 6793a370fdc35..eaf2231f5d771 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -151,26 +151,6 @@ def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None): raise ValueError(f'The provided lr scheduler "{scheduler}" is invalid') return lr_schedulers - def reinit_scheduler_properties(self, optimizers: list, schedulers: list): - # Reinitialize optimizer.step properties added by schedulers - for scheduler in schedulers: - scheduler = scheduler['scheduler'] - state = None - - for optimizer in optimizers: - # check that we dont mix users optimizers and schedulers - if scheduler.optimizer == optimizer: - # Find the mro belonging to the base lr scheduler class - for i, mro in enumerate(scheduler.__class__.__mro__): - if mro in (optim.lr_scheduler._LRScheduler, optim.lr_scheduler.ReduceLROnPlateau): - state = scheduler.state_dict() - scheduler.__class__.__mro__[i].__init__(scheduler, optimizer) - scheduler.load_state_dict(state) - break - - if state is not None: - break - class _MockOptimizer(Optimizer): """The `_MockOptimizer` will be used inplace of an optimizer in the event that `None` diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index f6e62abe0b007..ee6d70f42f247 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -15,16 +15,15 @@ import os from abc import ABC from argparse import ArgumentParser, Namespace -from typing import cast, List, Optional, Type, TypeVar, Union +from typing import Any, cast, List, Optional, Type, TypeVar, Union +import torch + +from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.accelerators.legacy.accelerator import Accelerator -from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint, ProgressBarBase +from 
pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, ProgressBarBase from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.loggers.base import LightningLoggerBase -from pytorch_lightning.loggers.tensorboard import TensorBoardLogger -from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector -from pytorch_lightning.trainer.connectors.model_connector import ModelConnector from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, _TPU_AVAILABLE, DeviceType, DistributedType, rank_zero_warn from pytorch_lightning.utilities.argparse import ( @@ -34,7 +33,6 @@ parse_env_variables, ) from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.model_helpers import is_overridden if _TPU_AVAILABLE: import torch_xla.core.xla_model as xm @@ -42,6 +40,9 @@ if _HOROVOD_AVAILABLE: import horovod.torch as hvd +from pytorch_lightning.loggers.tensorboard import TensorBoardLogger +from pytorch_lightning.utilities.model_helpers import is_overridden + class TrainerProperties(ABC): @@ -59,14 +60,84 @@ class TrainerProperties(ABC): _default_root_dir: str _weights_save_path: str accelerator_backend: Accelerator - logger: LightningLoggerBase - model_connector: ModelConnector - checkpoint_connector: CheckpointConnector - callbacks: List[Callback] num_nodes: int num_processes: int + accelerator_connector: BackendConnector _lightning_optimizers = None + @property + def accelerator(self): + return self.accelerator_connector.accelerator + + @property + def accelerator_backend(self): + # for backward compatibility + return self.accelerator + + @property + def distributed_backend(self): + # for backward compatibility + return self.accelerator_connector.distributed_backend + + @property + def training_type_plugin(self): + return self.accelerator.training_type_plugin + + @property + def precision_plugin(self): + return self.accelerator.precision_plugin + + @property + def global_rank(self): + return self.accelerator.training_type_plugin.global_rank + + @property + def local_rank(self): + # some training types define a local rank + return getattr(self.accelerator.training_type_plugin, "local_rank", 0) + + @property + def node_rank(self): + # some training types define a node rank + return getattr(self.accelerator.training_type_plugin, "node_rank", 0) + + @property + def world_size(self): + # some training types define a world size + return getattr(self.accelerator.training_type_plugin, "world_size", 1) + + @property + def _distrib_type(self): + return self.accelerator_connector._distrib_type + + @property + def _device_type(self): + return self.accelerator_connector._device_type + + @property + def num_nodes(self): + return self.accelerator_connector.num_nodes + + @property + def num_processes(self): + return self.accelerator_connector.num_processes + + @property + def root_gpu(self): + return self.accelerator_connector.root_gpu + + @property + def tpu_cores(self) -> int: + return self.accelerator_connector.tpu_cores + + @property + def num_gpus(self) -> int: + return self.accelerator_connector.num_gpus + + @property + def data_parallel_device_ids(self): + return self.accelerator_connector.parallel_device_ids + @property def log_dir(self): if self.logger is None: @@ -74,8 +145,7 @@ def log_dir(self): else: dirpath = getattr(self.logger, 'log_dir' if isinstance(self.logger,
TensorBoardLogger) else 'save_dir') - if self.accelerator_backend is not None: - dirpath = self.accelerator_backend.broadcast(dirpath) + dirpath = self.training_type_plugin.broadcast(dirpath) return dirpath @property @@ -166,11 +236,8 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: return add_argparse_args(cls, parent_parser) @property - def num_gpus(self) -> int: - gpus = self.data_parallel_device_ids - if gpus is None: - return 0 - return len(gpus) + def gpus(self) -> Optional[Union[List[int], str, int]]: + return self.accelerator_connector.gpus @property def data_parallel(self) -> bool: @@ -210,7 +277,7 @@ def disable_validation(self) -> bool: @property def enable_validation(self) -> bool: """ Check if we should run validation during training. """ - model_ref = self.model_connector.get_model() + model_ref = self.get_model() val_loop_enabled = is_overridden('validation_step', model_ref) and self.limit_val_batches > 0 return val_loop_enabled @@ -271,8 +338,31 @@ def checkpoint_callbacks(self) -> List[ModelCheckpoint]: def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) + @property + def model(self) -> Any: + """ + The LightningModule, but possibly wrapped into DataParallel or DistributedDataParallel. + To access the pure LightningModule, use + :meth:`~pytorch_lightning.trainer.trainer.Trainer.lightning_module` instead. + """ + return self.accelerator.model + + @model.setter + def model(self, model: torch.nn.Module): + """ + Setter for the model, pass-through to accelerator and plugin where the model reference is stored. + Used by the Tuner to reset the state of Trainer and Accelerator. + + Args: + model: The LightningModule, possibly wrapped into DataParallel or DistributedDataParallel, depending + on the backend. 
+ """ + self.accelerator.model = model + def get_model(self): - return self.model_connector.get_model() + # TODO: rename this to lightning_module (see training type plugin) + # backward compatible + return self.lightning_module @property def lightning_optimizers(self): @@ -280,11 +370,55 @@ def lightning_optimizers(self): self.convert_to_lightning_optimizers() return self._lightning_optimizers + @property + def lightning_module(self): + return self.training_type_plugin.lightning_module + + @property + def optimizers(self): + return self.accelerator.optimizers + + @optimizers.setter + def optimizers(self, new_optims): + self.accelerator.optimizers = new_optims + + @property + def lr_schedulers(self): + return self.accelerator.lr_schedulers + + @lr_schedulers.setter + def lr_schedulers(self, new_schedulers): + self.accelerator.lr_schedulers = new_schedulers + + @property + def optimizer_frequencies(self): + return self.accelerator.optimizer_frequencies + + @optimizer_frequencies.setter + def optimizer_frequencies(self, new_freqs): + self.accelerator.optimizer_frequencies = new_freqs + + @property + def amp_backend(self): + return self.accelerator.amp_backend + + @property + def precision(self): + return self.accelerator.precision + + @property + def scaler(self): + return self.accelerator.scaler + + # TODO: refactor this so that it can be done in LightningOptimizer def __getstate__(self): # remove lightning_optimizers self._lightning_optimizers = None return self.__dict__ + def __setstate__(self, state): + self.__dict__ = state + @property def require_distributed_sampler(self): if self.accelerator_backend is not None: @@ -296,8 +430,9 @@ def require_distributed_sampler(self): @property def distributed_sampler_kwargs(self): if self.accelerator_backend is not None: - return self.accelerator_backend.distributed_sampler_kwargs + return self.training_type_plugin.distributed_sampler_kwargs + # TODO: make sure the cases below are handled by the training_type_plugin if self._device_type == DeviceType.TPU: kwargs = dict(num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 184f5c41b878b..1239ac4913ff5 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Trainer to automate the training.""" - import warnings from itertools import count from pathlib import Path @@ -22,14 +21,14 @@ from torch.utils.data import DataLoader from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.legacy.accelerator import Accelerator -from pytorch_lightning.accelerators.legacy.accelerator_connector import AcceleratorConnector +from pytorch_lightning.accelerators import Accelerator +from pytorch_lightning.accelerators.accelerator_connector import BackendConnector from pytorch_lightning.callbacks import Callback from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.step_result import Result from pytorch_lightning.loggers import LightningLoggerBase -from pytorch_lightning.plugins.legacy.plugin_connector import PluginConnector from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import ConfigValidator @@ -41,7 +40,6 @@ from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector from pytorch_lightning.trainer.connectors.model_connector import ModelConnector from pytorch_lightning.trainer.connectors.optimizer_connector import OptimizerConnector -from pytorch_lightning.trainer.connectors.precision_connector import PrecisionConnector from pytorch_lightning.trainer.connectors.profiler_connector import ProfilerConnector from pytorch_lightning.trainer.connectors.slurm_connector import SLURMConnector from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector @@ -297,20 +295,23 @@ def __init__( reload when reaching the minimum length of datasets. 
""" super().__init__() - self._device_type = DeviceType.CPU - self._distrib_type = None self._running_stage = None self._predicting = False + distributed_backend = distributed_backend or accelerator + # init connectors self.dev_debugger = InternalDebugger(self) self.config_validator = ConfigValidator(self) self.data_connector = DataConnector(self) self.optimizer_connector = OptimizerConnector(self) - self.accelerator_connector = AcceleratorConnector(self) - self.logger_connector = LoggerConnector(self) + + self.accelerator_connector = BackendConnector( + num_processes, tpu_cores, distributed_backend, auto_select_gpus, gpus, num_nodes, sync_batchnorm, benchmark, + replace_sampler_ddp, deterministic, precision, amp_backend, amp_level, plugins + ) + self.logger_connector = LoggerConnector(self, log_gpu_memory) self.model_connector = ModelConnector(self) - self.precision_connector = PrecisionConnector(self) self.callback_connector = CallbackConnector(self) self.debugging_connector = DebuggingConnector(self) self.training_tricks_connector = TrainingTricksConnector(self) @@ -318,13 +319,11 @@ def __init__( self.checkpoint_connector = CheckpointConnector(self) self.slurm_connector = SLURMConnector(self) self.tuner = Tuner(self) - self.accelerator_backend = None self.evaluation_loop = EvaluationLoop(self) self.train_loop = TrainLoop(self, multiple_trainloader_mode) - self.plugin_connector = PluginConnector(self) # training state - self.model = None + self.weights_summary = weights_summary self.shown_warnings = set() # init callbacks @@ -355,22 +354,6 @@ def __init__( gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan ) - # init accelerator related flags - self.accelerator_connector.on_trainer_init( - num_processes, - tpu_cores, - accelerator, - distributed_backend, - auto_select_gpus, - gpus, - num_nodes, - log_gpu_memory, - sync_batchnorm, - benchmark, - replace_sampler_ddp, - deterministic, - ) - # init train loop related flags # TODO: remove in 1.3.0 if automatic_optimization is None: @@ -415,12 +398,6 @@ def __init__( fast_dev_run, ) - # set precision - self.precision_connector.on_trainer_init(precision, amp_level, amp_backend) - - # last thing are the plugins which override whatever the trainer used by default - self.plugin_connector.on_trainer_init(plugins) - # Callback system self.on_init_end() @@ -431,17 +408,6 @@ def setup_trainer(self, model: LightningModule): Args: model: The model to run sanity test on. """ - # -------------------------- - # Setup?? - # -------------------------- - ref_model = self.get_model() - - # set the ranks and devices - self.accelerator_backend.dist.rank = self.global_rank - self.accelerator_backend.dist.device = ref_model.device - - # set local properties on the model - self.model_connector.copy_trainer_model_properties(model) # init amp. 
Must be done here instead of __init__ to allow ddp to work if self.amp_backend == AMPType.NATIVE and self.precision == 16 and self._device_type != DeviceType.TPU: @@ -450,20 +416,10 @@ def setup_trainer(self, model: LightningModule): # log hyper-parameters if self.logger is not None: # save exp to get started (this is where the first experiment logs are written) - self.logger.log_hyperparams(ref_model.hparams_initial) - self.logger.log_graph(ref_model) + self.logger.log_hyperparams(model.hparams_initial) + self.logger.log_graph(model) self.logger.save() - # wait for all to join if on distributed - self.accelerator_backend.barrier("setup_trainer") - - # register auto-resubmit when on SLURM - self.slurm_connector.register_slurm_signal_handlers() - - # track model now. - # if cluster resets state, the model will update with the saved weights - self.model = model - def fit( self, model: LightningModule, @@ -490,6 +446,9 @@ def fit( self._state = TrainerState.RUNNING self._set_wide_running_stage(RunningStage.TRAINING) + # set local properties on the model + self.model_connector.copy_trainer_model_properties(model) + # ---------------------------- # LINK DATA # ---------------------------- @@ -502,25 +461,32 @@ def fit( # ---------------------------- # SET UP TRAINING # ---------------------------- - self.accelerator_backend = self.accelerator_connector.select_accelerator() + self.call_setup_hook(model) self.call_hook("on_before_accelerator_backend_setup", model) - self.accelerator_backend.setup(model) - - # ---------------------------- - # INSPECT THESE FOR MAIN LOOPS - # ---------------------------- - # assign training and eval functions... inspect these to see the train and eval loops :) - self.accelerator_backend.train_loop = self.train - self.accelerator_backend.validation_loop = self.run_evaluation - self.accelerator_backend.test_loop = self.run_evaluation + self.accelerator_backend.setup(self, model) + self.setup_trainer(model) # ---------------------------- # TRAIN # ---------------------------- # hook - self.call_hook('on_fit_start') - results = self.accelerator_backend.train() + self.call_hook("on_fit_start") + + # plugin will setup training (e.g. ddp will launch child processes) + # TODO: the old setup is now called "pre_training", where should this hook be called now? + self.training_type_plugin.pre_training() + self.precision_plugin.pre_training() + + # double dispatch: let the plugin initiate the training/test loop. 
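
The "double dispatch" comment above captures the new control flow: `fit()` hands control to the training type plugin, and the plugin decides when, and in which process, to call back into the trainer's loops, storing the outcome on `results` for `fit()` to read back. A simplified sketch of that shape (the real signatures live on the training type plugins):

class SketchTrainingTypePlugin:
    # illustrative only: the call-back half of the double dispatch

    def start_training(self, trainer):
        # single-process plugins call straight back into the trainer;
        # spawn-based plugins first launch workers and run this same
        # call-back inside each of them
        self.results = trainer.train()

    def start_testing(self, trainer):
        self.results = trainer.run_test()
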
+ if self.testing: + self.training_type_plugin.start_testing(self) + else: + self.training_type_plugin.start_training(self) + + self.precision_plugin.post_training() + self.training_type_plugin.post_training() self.accelerator_backend.teardown() + results = self.training_type_plugin.results # ---------------------------- # POST-Training CLEAN UP @@ -535,7 +501,6 @@ def fit( # return 1 when finished # used for testing or when we need to know that training succeeded - if self._state != TrainerState.INTERRUPTED: self._state = TrainerState.FINISHED @@ -566,7 +531,45 @@ def _set_wide_running_stage(self, stage): self._running_stage = stage + def _pre_training_routine(self): + # wait for all to join if on distributed + self.accelerator.training_type_plugin.barrier("setup_training") + + # register auto-resubmit when on SLURM + self.slurm_connector.register_slurm_signal_handlers() + + # -------------------------- + # Pre-train + # -------------------------- + # on pretrain routine start + ref_model = self.get_model() + + self.on_pretrain_routine_start(ref_model) + if self.is_function_implemented("on_pretrain_routine_start"): + ref_model.on_pretrain_routine_start() + + # print model summary + if self.is_global_zero and self.weights_summary is not None and not self.testing: + if self.weights_summary in ModelSummary.MODES: + ref_model.summarize(mode=self.weights_summary) + else: + raise MisconfigurationException("weights_summary can be None, " + ", ".join(ModelSummary.MODES)) + + # restore training and model before hpc is called + self.checkpoint_connector.restore_weights() + + # on pretrain routine end + self.on_pretrain_routine_end(ref_model) + if self.is_function_implemented("on_pretrain_routine_end"): + ref_model.on_pretrain_routine_end() + def train(self): + + self._pre_training_routine() + + if not self.is_global_zero and self.progress_bar_callback is not None: + self.progress_bar_callback.disable() + self.run_sanity_check(self.get_model()) # set stage for logging @@ -609,11 +612,15 @@ def train(self): if self.should_stop: if met_min_epochs and met_min_steps: return - log.info( - 'Trainer was signaled to stop but required minimum epochs' - f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has' - ' not been met. Training will continue...' - ) + else: + log.info( + 'Trainer was signaled to stop but required minimum epochs' + f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has' + ' not been met. Training will continue...' 
+ ) + + # hook + self.train_loop.on_train_end() except KeyboardInterrupt: rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...') @@ -721,6 +728,7 @@ def run_evaluation(self, max_batches=None, on_epoch=False): # enable train mode again self.evaluation_loop.on_evaluation_model_train() + torch.set_grad_enabled(True) return eval_loop_results, deprecated_eval_results @@ -739,6 +747,9 @@ def track_output_for_epoch_end(self, outputs, output): return outputs def run_test(self): + if not self.is_global_zero and self.progress_bar_callback is not None: + self.progress_bar_callback.disable() + # only load test dataloader for testing # self.reset_test_dataloader(ref_model) with self.profiler.profile("run_test_evaluation"): @@ -863,8 +874,8 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): f'specify a path for a checkpoint .test(ckpt_path=PATH)' ) return {} - if self.accelerator_backend is not None and not self._device_type == DeviceType.TPU: - self.accelerator_backend.barrier() + if not self._device_type == DeviceType.TPU: + self.training_type_plugin.barrier() ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt['state_dict']) @@ -875,7 +886,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.model = model results = self.fit(model) # teardown @@ -893,7 +903,6 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.model = model results = self.fit(model) # teardown @@ -1041,16 +1050,6 @@ def call_hook(self, hook_name, *args, **kwargs): self._cache_logged_metrics() return output - @staticmethod - def available_plugins(): - """ - List of all available plugins that can be string arguments to the trainer. - - Returns: - List of all available plugins that are supported as string arguments. 
- """ - return PluginConnector.available_plugins() - @property def training(self) -> bool: return self._running_stage == RunningStage.TRAINING diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 778e1e7e1051e..03a72eb71ab84 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -22,6 +22,7 @@ from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.core.step_result import Result +from pytorch_lightning.plugins import ParallelPlugin from pytorch_lightning.trainer.states import RunningStage, TrainerState from pytorch_lightning.trainer.supporters import Accumulator, TensorRunningAccum from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType, parsing @@ -100,13 +101,6 @@ def should_skip_training(self): return should_by_epoch or self.trainer.num_training_batches == 0 def on_train_start(self): - # clear cache before training - if self.trainer._device_type == DeviceType.GPU and self.trainer.root_gpu is not None: - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(f"cuda:{self.trainer.root_gpu}"): - torch.cuda.empty_cache() - # hook self.trainer.call_hook("on_train_start") @@ -114,9 +108,6 @@ def on_train_start(self): self.trainer.profile_connector.on_train_start(self.trainer) def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule): - # bind logger and other properties - self.trainer.model_connector.copy_trainer_model_properties(model) - # clean hparams if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) @@ -130,32 +121,6 @@ def setup_fit(self, model, train_dataloader, val_dataloaders, datamodule): # attach model log function to callback self.trainer.callback_connector.attach_model_logging_functions(model) - def setup_training(self): - """ - Sanity check a few things before starting actual training. - """ - # -------------------------- - # Pre-train - # -------------------------- - ref_model = self.trainer.get_model() - - # on pretrain routine start - self.trainer.on_pretrain_routine_start(ref_model) - if self.trainer.is_function_implemented("on_pretrain_routine_start"): - ref_model.on_pretrain_routine_start() - - # print model summary - if self.trainer.is_global_zero: - ref_model.summarize(mode=self.trainer.weights_summary) - - # restore training state and model weights before hpc is called - self.trainer.checkpoint_connector.restore_weights() - - # on pretrain routine end - self.trainer.on_pretrain_routine_end(ref_model) - if self.trainer.is_function_implemented("on_pretrain_routine_end"): - ref_model.on_pretrain_routine_end() - def on_train_end(self): if self._teardown_already_run: return @@ -171,8 +136,10 @@ def on_train_end(self): # hook self.trainer.call_hook("on_train_end") + # todo: TPU 8 cores hangs in flush with TensorBoard. Might do for all loggers. 
+ # It might be related to xla tensors blocked when moving the cpu # kill loggers - if self.trainer.logger is not None: + if self.trainer.logger is not None and self.trainer.training_type_plugin.should_finalize: self.trainer.logger.finalize("success") # summarize profile results @@ -329,6 +296,8 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): model_ref._results = Result() with self.trainer.profiler.profile("training_step"): training_step_output = self.trainer.accelerator_backend.training_step(args) + self.trainer.accelerator_backend.post_training_step() + self.trainer.logger_connector.cache_logged_metrics() self._check_training_step_output(training_step_output) @@ -503,12 +472,15 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ def on_before_zero_grad(self, optimizer): self.trainer.call_hook('on_before_zero_grad', optimizer) + def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): + self.trainer.accelerator_backend.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) + def track_and_norm_grad(self, optimizer): # track gradient norms grad_norm_dic = self._track_gradient_norm() # clip gradients - self.trainer.accelerator_backend.clip_gradients(optimizer) + self.trainer.accelerator_backend.clip_gradients(optimizer, self.trainer.gradient_clip_val) self._cur_grad_norm_dict = grad_norm_dic def _track_gradient_norm(self): @@ -742,7 +714,7 @@ def train_step_and_backward_closure(): return result @contextmanager - def block_ddp_sync_behaviour(self): + def block_ddp_sync_behaviour(self, should_block_sync: bool = False): """ automatic_optimization = True Blocks ddp sync gradients behaviour on backwards pass. @@ -756,8 +728,12 @@ def block_ddp_sync_behaviour(self): context manager with sync behaviour off """ - if self.trainer.accelerator_backend is not None and self.automatic_optimization: - yield self.trainer.accelerator_backend.block_ddp_plugin_sync_behaviour() + if ( + isinstance(self.trainer.training_type_plugin, ParallelPlugin) + and (self.automatic_optimization or should_block_sync) + ): + with self.trainer.training_type_plugin.block_backward_sync(): + yield None else: yield None @@ -798,7 +774,8 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, self._curr_step_result = result if result is None: - self.warning_cache.warn("training_step returned None if it was on purpose, ignore this warning...") + if self.automatic_optimization: + self.warning_cache.warn("training_step returned None if it was on purpose, ignore this warning...") return None if not self._skip_backward and self.trainer.train_loop.automatic_optimization: @@ -824,12 +801,14 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, def backward(self, result, optimizer, opt_idx, *args, **kwargs): self.trainer.dev_debugger.track_event("backward_call") + should_accumulate = self.should_accumulate() + # backward can be called manually in the training loop if isinstance(result, torch.Tensor): - self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, *args, **kwargs) + self.trainer.accelerator_backend.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs) else: result.closure_loss = self.trainer.accelerator_backend.backward( - result.closure_loss, optimizer, opt_idx, *args, **kwargs + result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs ) if not self.should_accumulate(): diff --git a/pytorch_lightning/utilities/__init__.py 
b/pytorch_lightning/utilities/__init__.py index 94b08029b92c1..889ed96f43679 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -37,6 +37,7 @@ _OMEGACONF_AVAILABLE, _RPC_AVAILABLE, _TORCH_GREATER_EQUAL_1_6, + _TORCH_GREATER_EQUAL_1_7, _TORCH_LOWER_EQUAL_1_4, _TORCH_QUANTIZE_AVAILABLE, _TORCHTEXT_AVAILABLE, diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index fbed98ae2baa7..f20b978ebd8b6 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, List, MutableSequence, Optional, Union +from typing import Any, List, MutableSequence, Optional, Tuple, Union import torch @@ -145,9 +145,9 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: return gpus -def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int]]) -> Optional[List[int]]: +def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int], Tuple[int, ...]]) -> Optional[List[int]]: assert gpus is not None - if isinstance(gpus, MutableSequence): + if isinstance(gpus, (MutableSequence, tuple)): return list(gpus) # must be an int @@ -176,7 +176,7 @@ def _check_data_type(device_ids: Any) -> None: device_ids: gpus/tpu_cores parameter as passed to the Trainer """ if device_ids is not None and \ - (not isinstance(device_ids, (int, str, MutableSequence)) or isinstance(device_ids, bool)): + (not isinstance(device_ids, (int, str, MutableSequence, tuple)) or isinstance(device_ids, bool)): raise MisconfigurationException("Device ID's (GPU/TPU) must be int, string or sequence of ints or None.") diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index f6c0bf1d6cc54..c7796b433f1ed 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -63,6 +63,9 @@ class DistributedType(LightningEnum): DDP2 = 'ddp2' DDP_SPAWN = 'ddp_spawn' HOROVOD = 'horovod' + DDP_SHARDED = 'ddp_sharded' + DDP_SHARDED_SPAWN = 'ddp_sharded_spawn' + RPC_SEQUENTIAL_PLUGIN = 'rpc_sequential' class DeviceType(LightningEnum): diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 8ebcb570a394f..4d1b38eaf5949 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -49,9 +49,9 @@ def _compare_version(package: str, op, version) -> bool: _IS_WINDOWS = platform.system() == "Windows" - _TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0") _TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0") +_TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0") _TORCH_QUANTIZE_AVAILABLE = _module_available('torch.ops.quantized') _APEX_AVAILABLE = _module_available("apex.amp") _BOLTS_AVAILABLE = _module_available('pl_bolts') diff --git a/tests/accelerators/legacy/test_accelerator_connector.py b/tests/accelerators/legacy/test_accelerator_connector.py index 12bfe3a193a8a..c0f6c0c0a5b9b 100644 --- a/tests/accelerators/legacy/test_accelerator_connector.py +++ b/tests/accelerators/legacy/test_accelerator_connector.py @@ -16,102 +16,66 @@ from unittest import mock import pytest +import torch -from pytorch_lightning import accelerators, Trainer -from pytorch_lightning.accelerators import Accelerator +from 
pytorch_lightning import Trainer +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.cpu import CPUAccelerator +from pytorch_lightning.accelerators.gpu import GPUAccelerator from pytorch_lightning.callbacks import Callback +from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin, DDPSpawnPlugin, PrecisionPlugin, SingleDevicePlugin from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment -from pytorch_lightning.utilities import DistributedType from tests.helpers.boring_model import BoringModel def test_accelerator_choice_cpu(tmpdir): - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.CPUAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - - model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, - callbacks=[CB()], ) - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, SingleDevicePlugin) def test_accelerator_choice_ddp_cpu(tmpdir): - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSpawnAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() trainer = Trainer( fast_dev_run=True, accelerator='ddp_cpu', - num_processes=2, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch('torch.cuda.device_count', return_value=2) -def test_accelerator_choice_ddp(tmpdir): - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() +@mock.patch('torch.cuda.is_available', return_value=True) +def test_accelerator_choice_ddp(cuda_available_mock, device_count_mock): trainer = Trainer( fast_dev_run=True, accelerator='ddp', gpus=1, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @mock.patch('torch.cuda.device_count', return_value=2) -def test_accelerator_choice_ddp_spawn(tmpdir): - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPSpawnAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - raise SystemExit() - - model = BoringModel() 
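
Worth noting about the rewritten tests above: because `BackendConnector` now resolves the accelerator, training type plugin, and cluster environment eagerly in `Trainer.__init__`, composition can be asserted straight after construction, without the old callback-plus-SystemExit escape hatch. The pattern, condensed into a sketch that reuses this file's imports:

def test_accelerator_choice_pattern():
    trainer = Trainer(fast_dev_run=True, accelerator='ddp_cpu', num_processes=2)
    # no model and no fit() needed: the choice is fixed at construction time
    assert isinstance(trainer.accelerator_backend, CPUAccelerator)
    assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
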
+@mock.patch('torch.cuda.is_available', return_value=True) +def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock): trainer = Trainer( fast_dev_run=True, accelerator='ddp_spawn', gpus=1, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @mock.patch.dict( os.environ, { "CUDA_VISIBLE_DEVICES": "0,1", @@ -121,17 +85,18 @@ def on_fit_start(self, trainer, pl_module): "SLURM_LOCALID": "10" } ) -@mock.patch('torch.cuda.device_count', return_value=2) -def test_accelerator_choice_ddp_slurm(tmpdir): +def test_accelerator_choice_ddp_slurm(): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 + assert trainer.training_type_plugin.task_idx == 10 raise SystemExit() model = BoringModel() @@ -146,6 +111,7 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU") @mock.patch.dict( os.environ, { "CUDA_VISIBLE_DEVICES": "0,1", @@ -157,17 +123,18 @@ def on_fit_start(self, trainer, pl_module): } ) @mock.patch('torch.cuda.device_count', return_value=2) -def test_accelerator_choice_ddp2_slurm(tmpdir): +def test_accelerator_choice_ddp2_slurm(device_count_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type == DistributedType.DDP2 - assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx - + assert trainer.use_ddp2 + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 + assert trainer.training_type_plugin.task_idx == 10 raise SystemExit() model = BoringModel() @@ -182,25 +149,20 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@mock.patch.dict( - os.environ, { - "CUDA_VISIBLE_DEVICES": "0,1", - "WORLD_SIZE": "2", - "LOCAL_RANK": "10", - "NODE_RANK": "0", - } -) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU") 
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2", "LOCAL_RANK": "10", "NODE_RANK": "0"}) @mock.patch('torch.cuda.device_count', return_value=2) -def test_accelerator_choice_ddp_te(tmpdir): +def test_accelerator_choice_ddp_te(device_count_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 + assert trainer.training_type_plugin.task_idx == 10 raise SystemExit() model = BoringModel() @@ -215,25 +177,20 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) -@mock.patch.dict( - os.environ, { - "CUDA_VISIBLE_DEVICES": "0,1", - "WORLD_SIZE": "2", - "LOCAL_RANK": "10", - "NODE_RANK": "0", - } -) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU") +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2", "LOCAL_RANK": "10", "NODE_RANK": "0"}) @mock.patch('torch.cuda.device_count', return_value=2) -def test_accelerator_choice_ddp2_te(tmpdir): +def test_accelerator_choice_ddp2_te(device_count_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type == DistributedType.DDP2 - assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + assert trainer.use_ddp2 + assert isinstance(trainer.accelerator_backend, GPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDP2Plugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 + assert trainer.training_type_plugin.task_idx == 10 raise SystemExit() model = BoringModel() @@ -254,17 +211,17 @@ def on_fit_start(self, trainer, pl_module): "NODE_RANK": "0", }) @mock.patch('torch.cuda.device_count', return_value=0) -def test_accelerator_choice_ddp_cpu_te(tmpdir): +def test_accelerator_choice_ddp_cpu_te(device_count_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_backend.task_idx == 10 - assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx - + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert 
isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment) + assert trainer.training_type_plugin.cluster_environment.local_rank() == 10 + assert trainer.training_type_plugin.task_idx == 10 raise SystemExit() model = BoringModel() @@ -289,14 +246,17 @@ def on_fit_start(self, trainer, pl_module): } ) @mock.patch('torch.cuda.device_count', return_value=0) -def test_accelerator_choice_ddp_cpu_slurm(tmpdir): +def test_accelerator_choice_ddp_cpu_slurm(device_count_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) + assert trainer.use_ddp + assert trainer.accelerator_connector.is_slurm_managing_tasks + assert isinstance(trainer.accelerator_backend, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.task_idx == 0 raise SystemExit() model = BoringModel() @@ -321,7 +281,7 @@ def on_fit_start(self, trainer, pl_module): } ) @mock.patch('torch.cuda.device_count', return_value=0) -def test_accelerator_choice_ddp_cpu_custom_cluster(tmpdir): +def test_accelerator_choice_ddp_cpu_custom_cluster(device_count_mock): """ Test that we choose the custom cluster even when SLURM or TE flags are around """ @@ -334,9 +294,10 @@ def master_address(self): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN) - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) - assert isinstance(trainer.accelerator_backend.cluster_environment, CustomCluster) + assert trainer.use_ddp + assert isinstance(trainer.accelerator_backend, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert isinstance(trainer.training_type_plugin.cluster_environment, CustomCluster) raise SystemExit() model = BoringModel() @@ -362,29 +323,29 @@ def on_fit_start(self, trainer, pl_module): } ) @mock.patch('torch.cuda.device_count', return_value=0) -def test_custom_accelerator(tmpdir): +def test_custom_accelerator(device_count_mock): class Accel(Accelerator): + pass - def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True) -> None: - pass - - class CB(Callback): + class Prec(PrecisionPlugin): + pass - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, Accel) - raise SystemExit() + class TrainTypePlugin(SingleDevicePlugin): + pass - model = BoringModel() + accelerator = Accel( + training_type_plugin=TrainTypePlugin(device=torch.device("cpu")), + precision_plugin=Prec(), + ) trainer = Trainer( + accelerator=accelerator, fast_dev_run=True, - accelerator=Accel(), num_processes=2, - callbacks=[CB()], ) - - with pytest.raises(SystemExit): - trainer.fit(model) + assert isinstance(trainer.accelerator_backend, Accel) + assert isinstance(trainer.training_type_plugin, TrainTypePlugin) + assert isinstance(trainer.precision_plugin, Prec) @mock.patch.dict( @@ -397,12 +358,14 @@ def on_fit_start(self, trainer, pl_module): } ) @mock.patch('torch.cuda.device_count', return_value=0) -def test_dist_backend_accelerator_mapping(tmpdir): +def 
test_dist_backend_accelerator_mapping(device_count_mock): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) + assert isinstance(trainer.training_type_plugin, DDPPlugin) + assert trainer.training_type_plugin.task_idx == 0 raise SystemExit() model = BoringModel() diff --git a/tests/accelerators/legacy/test_ddp_spawn.py b/tests/accelerators/legacy/test_ddp_spawn.py index 106260bbf3dd0..1e17947fe6eb9 100644 --- a/tests/accelerators/legacy/test_ddp_spawn.py +++ b/tests/accelerators/legacy/test_ddp_spawn.py @@ -25,7 +25,6 @@ @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_early_stop_ddp_spawn(tmpdir): - """Make sure DDP works. with early stopping""" tutils.set_random_master_port() trainer_options = dict( diff --git a/tests/accelerators/legacy/test_multi_nodes_gpu.py b/tests/accelerators/legacy/test_multi_nodes_gpu.py index 8f6396f485fdc..20faa100016e9 100644 --- a/tests/accelerators/legacy/test_multi_nodes_gpu.py +++ b/tests/accelerators/legacy/test_multi_nodes_gpu.py @@ -13,6 +13,7 @@ # limitations under the License. import os import sys +from unittest import mock import pytest import torch @@ -68,11 +69,11 @@ def validation_step(self, batch, batch_idx): @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) def test__validation_step__log(tmpdir): """ Tests that validation_step can log """ - os.environ['PL_DEV_DEBUG'] = '1' class TestModel(BoringModel): diff --git a/tests/accelerators/legacy/test_tpu_backend.py b/tests/accelerators/legacy/test_tpu_backend.py index 864a250eb7bef..8e20cefe3b3d5 100644 --- a/tests/accelerators/legacy/test_tpu_backend.py +++ b/tests/accelerators/legacy/test_tpu_backend.py @@ -26,7 +26,6 @@ @pl_multi_process_test def test_resume_training_on_cpu(tmpdir): """ Checks if training can be resumed from a saved checkpoint on CPU""" - # Train a model on TPU model = BoringModel() trainer = Trainer( @@ -61,7 +60,6 @@ def test_if_test_works_after_train(tmpdir): # Train a model on TPU model = BoringModel() - trainer = Trainer(checkpoint_callback=True, max_epochs=1, tpu_cores=8, default_root_dir=tmpdir) + trainer = Trainer(max_epochs=1, tpu_cores=8, default_root_dir=tmpdir, fast_dev_run=True) trainer.fit(model) - - assert trainer.test() == 1 + assert trainer.test(model) == 1 diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 061c001389e40..060d42fd5edc3 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -53,8 +53,8 @@ def test_trainer_callback_system(torch_save): assert callback_mock.method_calls == [ call.on_init_start(trainer), call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), call.setup(trainer, model, 'fit'), + call.on_before_accelerator_backend_setup(trainer, model), call.on_fit_start(trainer, model), call.on_pretrain_routine_start(trainer, model), call.on_pretrain_routine_end(trainer, model), @@ -108,8 +108,8 @@ def test_trainer_callback_system(torch_save): assert callback_mock.method_calls == [ call.on_init_start(trainer), call.on_init_end(trainer), - call.on_before_accelerator_backend_setup(trainer, model), call.setup(trainer, model, 'test'), + call.on_before_accelerator_backend_setup(trainer, model), 
     call.on_fit_start(trainer, model),
     call.on_test_start(trainer, model),
     call.on_test_epoch_start(trainer, model),
diff --git a/tests/callbacks/test_finetuning_callback.py b/tests/callbacks/test_finetuning_callback.py
index e071ed3436dea..503955ac875ac 100644
--- a/tests/callbacks/test_finetuning_callback.py
+++ b/tests/callbacks/test_finetuning_callback.py
@@ -19,6 +19,7 @@
 from pytorch_lightning import LightningModule, seed_everything, Trainer
 from pytorch_lightning.callbacks import BackboneFinetuning, BaseFinetuning
+from pytorch_lightning.callbacks.base import Callback
 from tests.helpers import BoringModel, RandomDataset
@@ -215,3 +216,31 @@ def __init__(self):
     assert torch.equal(optimizer.param_groups[2]["params"][0], model.backbone[2].weight)
     assert torch.equal(optimizer.param_groups[2]["params"][1], model.backbone[3].weight)
     assert torch.equal(optimizer.param_groups[2]["params"][2], model.backbone[4].weight)
+
+
+def test_on_before_accelerator_backend_setup(tmpdir):
+    """
+    `on_before_accelerator_backend_setup` hook is used by finetuning callbacks to freeze the model
+    before the configure_optimizers function call.
+    """
+
+    class TestCallback(Callback):
+
+        def on_before_accelerator_backend_setup(self, trainer, pl_module):
+            pl_module.on_before_accelerator_backend_setup_called = True
+
+    class TestModel(BoringModel):
+
+        def __init__(self):
+            super().__init__()
+            self.on_before_accelerator_backend_setup_called = False
+
+        def configure_optimizers(self):
+            assert self.on_before_accelerator_backend_setup_called
+            return super().configure_optimizers()
+
+    model = TestModel()
+    callback = TestCallback()
+
+    trainer = Trainer(default_root_dir=tmpdir, callbacks=[callback], fast_dev_run=True)
+    trainer.fit(model)
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index c9fe92970c5ac..91db602690e94 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -521,7 +521,6 @@ def test_ckpt_metric_names(tmpdir):
 @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
 def test_default_checkpoint_behavior(tmpdir):
     seed_everything(1234)
-    os.environ['PL_DEV_DEBUG'] = '1'
 
     model = LogInTwoMethods()
     trainer = Trainer(
diff --git a/tests/conftest.py b/tests/conftest.py
index 8dd8fdd251912..9bc607e119451 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
 import sys
 import threading
 from functools import partial, wraps
@@ -21,6 +21,16 @@
 import torch.multiprocessing as mp
 
+@pytest.fixture(scope="function", autouse=True)
+def restore_env_variables():
+    """ Ensures that environment variables set during the test do not leak out.
""" + env_backup = os.environ.copy() + yield + # restore environment as it was before running the test + os.environ.clear() + os.environ.update(env_backup) + + def pytest_configure(config): config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn") @@ -44,7 +54,6 @@ def tmpdir_server(tmpdir): else: # unfortunately SimpleHTTPRequestHandler doesn't accept the directory arg in python3.6 # so we have to hack it like this - import os class Handler(SimpleHTTPRequestHandler): diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index a83a6a41c9287..8cf1f0a9d1ffb 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -14,23 +14,26 @@ import pickle from argparse import ArgumentParser from typing import Any, Dict -from unittest.mock import MagicMock +from unittest import mock +from unittest.mock import PropertyMock import pytest import torch import torch.nn.functional as F from pytorch_lightning import LightningDataModule, Trainer -from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.states import TrainerState +from pytorch_lightning.utilities.model_helpers import is_overridden from tests.helpers import BoringDataModule, BoringModel from tests.helpers.datamodules import ClassifDataModule from tests.helpers.simple_models import ClassificationModel from tests.helpers.utils import reset_seed, set_random_master_port -def test_can_prepare_data(tmpdir): +@mock.patch("pytorch_lightning.trainer.trainer.Trainer.node_rank", new_callable=PropertyMock) +@mock.patch("pytorch_lightning.trainer.trainer.Trainer.local_rank", new_callable=PropertyMock) +def test_can_prepare_data(local_rank, node_rank): dm = BoringDataModule() trainer = Trainer() @@ -40,33 +43,36 @@ def test_can_prepare_data(tmpdir): # prepare_data_per_node = True # local rank = 0 (True) trainer.prepare_data_per_node = True - trainer.local_rank = 0 + + local_rank.return_value = 0 + assert trainer.local_rank == 0 assert trainer.data_connector.can_prepare_data() # local rank = 1 (False) - trainer.local_rank = 1 + local_rank.return_value = 1 + assert trainer.local_rank == 1 assert not trainer.data_connector.can_prepare_data() # prepare_data_per_node = False (prepare across all nodes) # global rank = 0 (True) trainer.prepare_data_per_node = False - trainer.node_rank = 0 - trainer.local_rank = 0 + node_rank.return_value = 0 + local_rank.return_value = 0 assert trainer.data_connector.can_prepare_data() # global rank = 1 (False) - trainer.node_rank = 1 - trainer.local_rank = 0 + node_rank.return_value = 1 + local_rank.return_value = 0 assert not trainer.data_connector.can_prepare_data() - trainer.node_rank = 0 - trainer.local_rank = 1 + node_rank.return_value = 0 + local_rank.return_value = 1 assert not trainer.data_connector.can_prepare_data() # 2 dm # prepar per node = True # local rank = 0 (True) trainer.prepare_data_per_node = True - trainer.local_rank = 0 + local_rank.return_value = 0 # is_overridden prepare data = True # has been called @@ -416,7 +422,8 @@ def test_step_end(self, outputs): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_dm_transfer_batch_to_device(tmpdir): +@mock.patch("pytorch_lightning.accelerators.accelerator.Accelerator.lightning_module", new_callable=PropertyMock) +def test_dm_transfer_batch_to_device(get_module_mock): class CustomBatch: @@ -441,11 +448,10 @@ def 
transfer_batch_to_device(self, data, device): trainer = Trainer(gpus=1) # running .fit() would require us to implement custom data loaders, we mock the model reference instead - trainer.get_model = MagicMock(return_value=model) - - model.transfer_batch_to_device = dm.transfer_batch_to_device + get_module_mock.return_value = model + if is_overridden('transfer_batch_to_device', dm): + model.transfer_batch_to_device = dm.transfer_batch_to_device - trainer.accelerator_backend = GPUAccelerator(trainer) batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) assert dm.hook_called diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 8412dc3028d59..a63f4107a63fe 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -175,11 +175,11 @@ def configure_optimizers(self): def optimizer_step( self, - current_epoch, - batch_nb, + epoch, + batch_idx, optimizer, optimizer_idx, - closure, + optimizer_closure, on_tpu=False, using_native_amp=False, using_lbfgs=False @@ -190,7 +190,7 @@ def optimizer_step( for pg in optimizer.param_groups: pg['lr'] = lr_scale * 0.01 - optimizer.step(closure=closure) + optimizer.step(closure=optimizer_closure) model = TestModel() model.training_epoch_end = None diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index 1db8be874e32d..94a8c8f6a5906 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -214,7 +214,8 @@ def test_state(tmpdir): lightning_dict = {} special_attrs = [ "_accumulate_grad_batches", "_optimizer", "_optimizer_idx", "_support_closure", "_trainer", "__getstate__", - "__setstate__", "state_dict", "load_state_dict", "zero_grad", "__setstate__", "add_param_group" + "__setstate__", "state_dict", "load_state_dict", "zero_grad", "__setstate__", "add_param_group", + "_total_optimizer_step_calls", ] for k, v in lightning_optimizer.__dict__.items(): diff --git a/tests/core/test_memory.py b/tests/core/test_memory.py index 7ba777633e719..1db6981064c6c 100644 --- a/tests/core/test_memory.py +++ b/tests/core/test_memory.py @@ -293,7 +293,12 @@ def test_empty_model_size(mode): @pytest.mark.skipif(not torch.cuda.is_available(), reason="Test requires GPU.") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires native AMP.") -@pytest.mark.parametrize('precision', [16, 32]) +@pytest.mark.parametrize( + 'precision', [ + pytest.param(16, marks=pytest.mark.skip(reason="no longer valid, because 16 can mean mixed precision")), + pytest.param(32), + ] +) def test_model_size_precision(monkeypatch, tmpdir, precision): """ Test model size for half and full precision. 
""" model = PreCalculatedModel(precision) diff --git a/tests/deprecated_api/test_remove_1-4.py b/tests/deprecated_api/test_remove_1-4.py index 6ecf16edd2a51..b11108c62e445 100644 --- a/tests/deprecated_api/test_remove_1-4.py +++ b/tests/deprecated_api/test_remove_1-4.py @@ -24,7 +24,8 @@ LightningParallelModule, ) from pytorch_lightning.overrides.distributed import LightningDistributedModule -from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins import DDPSpawnPlugin +from pytorch_lightning.plugins.environments import TorchElasticEnvironment from tests.deprecated_api import _soft_unimport_module from tests.helpers import BoringModel @@ -50,8 +51,8 @@ def test_v1_4_0_deprecated_imports(): def test_v1_4_0_deprecated_trainer_device_distrib(): """Test that Trainer attributes works fine.""" trainer = Trainer() - trainer._distrib_type = None - trainer._device_type = None + trainer.accelerator_connector._distrib_type = None + trainer.accelerator_connector._device_type = None with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): trainer.on_cpu = True @@ -67,7 +68,7 @@ def test_v1_4_0_deprecated_trainer_device_distrib(): trainer.on_tpu = True with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): assert trainer.on_tpu - trainer._device_type = None + trainer.accelerator_connector._device_type = None with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): trainer.use_tpu = True with pytest.deprecated_call(match='deprecated in v1.2 and will be removed in v1.4'): @@ -146,24 +147,23 @@ def test_v1_4_0_deprecated_metrics(): multiclass_auc_decorator() -class CustomDDPPlugin(DDPPlugin): +class CustomDDPPlugin(DDPSpawnPlugin): - def configure_ddp(self, model, device_ids): + def configure_ddp(self): # old, deprecated implementation with pytest.deprecated_call( match='`LightningDistributedDataParallel` is deprecated since v1.2 and will be removed in v1.4.' 
): - model = LightningDistributedDataParallel( - module=model, - device_ids=device_ids, + self._model = LightningDistributedDataParallel( + module=self.lightning_module, + device_ids=self.determine_ddp_device_ids(), **self._ddp_kwargs, ) - assert isinstance(model, torch.nn.parallel.DistributedDataParallel) - assert isinstance(model.module, LightningDistributedModule) - return model + assert isinstance(self.model, torch.nn.parallel.DistributedDataParallel) + assert isinstance(self.model.module, LightningDistributedModule) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows") def test_v1_4_0_deprecated_lightning_distributed_data_parallel(tmpdir): model = BoringModel() @@ -172,7 +172,12 @@ def test_v1_4_0_deprecated_lightning_distributed_data_parallel(tmpdir): fast_dev_run=True, gpus=2, accelerator="ddp_spawn", - plugins=[CustomDDPPlugin()], + plugins=[ + CustomDDPPlugin( + parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)], + cluster_environment=TorchElasticEnvironment(), + ) + ] ) trainer.fit(model) diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py index 3f131ab055d98..ec1e81fc2cecb 100644 --- a/tests/helpers/pipelines.py +++ b/tests/helpers/pipelines.py @@ -42,11 +42,6 @@ def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50 for dataloader in test_loaders: run_prediction(pretrained_model, dataloader, min_acc=min_acc) - if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN): - # on hpc this would work fine... but need to hack it for the purpose of the test - trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers() - def run_model_test( trainer_options, @@ -63,7 +58,6 @@ def run_model_test( # logger file to get meta logger = get_default_logger(save_dir, version=version) trainer_options.update(logger=logger) - trainer = Trainer(**trainer_options) initial_values = torch.tensor([torch.sum(torch.abs(x)) for x in model.parameters()]) trainer.fit(model, datamodule=data) @@ -88,10 +82,8 @@ def run_model_test( if with_hpc: if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): # on hpc this would work fine... but need to hack it for the purpose of the test - trainer.model = pretrained_model - trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = trainer.init_optimizers( - pretrained_model - ) + trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \ + trainer.init_optimizers(pretrained_model) # test HPC saving trainer.checkpoint_connector.hpc_save(save_dir, logger) diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py index a212e77ffe562..d23f3d5540e78 100644 --- a/tests/helpers/utils.py +++ b/tests/helpers/utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
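
A related shift shows in the deprecation test above: a custom plugin is no longer configured behind the scenes from trainer flags; it receives its parallel devices and cluster environment explicitly at construction. Roughly, assuming the `CustomDDPPlugin` defined in that test:

import torch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import TorchElasticEnvironment

plugin = CustomDDPPlugin(
    parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)],
    cluster_environment=TorchElasticEnvironment(),
)
trainer = Trainer(fast_dev_run=True, gpus=2, accelerator="ddp_spawn", plugins=[plugin])
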
import functools import os +import traceback from pytorch_lightning import seed_everything from pytorch_lightning.callbacks import ModelCheckpoint @@ -92,11 +93,15 @@ def inner_f(queue, **kwargs): try: func(**kwargs) queue.put(1) - # todo: specify the possible exception except Exception: - import traceback - traceback.print_exc() - queue.put(-1) + _trace = traceback.format_exc() + print(_trace) + # code 17 means RuntimeError: tensorflow/compiler/xla/xla_client/mesh_service.cc:364 : + # Failed to meet rendezvous 'torch_xla.core.xla_model.save': Socket closed (14) + if "terminated with exit code 17" in _trace: + queue.put(1) + else: + queue.put(-1) proc = Process(target=inner_f, args=(queue, ), kwargs=kwargs) proc.start() diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 8d620bb563f2e..ff623af963c62 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -21,6 +21,7 @@ import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import SLURMEnvironment from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -108,7 +109,15 @@ def test_amp_multi_gpu_ddp_spawn(tmpdir): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@mock.patch.dict(os.environ, {"SLURM_LOCALID": "0"}) +@mock.patch.dict( + os.environ, { + "SLURM_NTASKS": "1", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0" + } +) def test_amp_gpu_ddp_slurm_managed(tmpdir): """Make sure DDP + AMP work.""" # simulate setting slurm flags @@ -132,17 +141,18 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): callbacks=[checkpoint], logger=logger, ) - trainer.is_slurm_managing_tasks = True - trainer.fit(model) + _ = trainer.fit(model) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, 'amp + ddp model failed to complete' # test root model address - assert trainer.slurm_connector.resolve_root_node_address('abc') == 'abc' - assert trainer.slurm_connector.resolve_root_node_address('abc[23]') == 'abc23' - assert trainer.slurm_connector.resolve_root_node_address('abc[23-24]') == 'abc23' - assert trainer.slurm_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23' + assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment) + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc') == 'abc' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23]') == 'abc23' + assert trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24]') == 'abc23' + generated = trainer.training_type_plugin.cluster_environment.resolve_root_node_address('abc[23-24, 45-40, 40]') + assert generated == 'abc23' def test_cpu_model_with_amp(tmpdir): @@ -158,7 +168,7 @@ def test_cpu_model_with_amp(tmpdir): model = BoringModel() - with pytest.raises((MisconfigurationException, ModuleNotFoundError)): + with pytest.raises(MisconfigurationException, match="AMP is only available on GPU"): tpipes.run_model_test(trainer_options, model, on_gpu=False) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 2c1d188f8049f..1c3e4b284b2e2 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -21,7 +21,6 @@ import tests.helpers.pipelines as 
tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel @@ -69,6 +68,10 @@ def mocked_device_count(monkeypatch): def device_count(): return PRETEND_N_OF_GPUS + def is_available(): + return True + + monkeypatch.setattr(torch.cuda, 'is_available', is_available) monkeypatch.setattr(torch.cuda, 'device_count', device_count) @@ -163,6 +166,7 @@ def test_determine_root_gpu_device(gpus, expected_root_gpu): pytest.param(-1, list(range(PRETEND_N_OF_GPUS)), id="-1 - use all gpus"), pytest.param([0], [0]), pytest.param([1, 3], [1, 3]), + pytest.param((1, 3), [1, 3]), pytest.param('0', [0]), pytest.param('3', [3]), pytest.param('1, 3', [1, 3]), @@ -182,7 +186,6 @@ def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): pytest.param([-1]), pytest.param([None]), pytest.param(['0']), - pytest.param((0, 1)), ]) def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): with pytest.raises(MisconfigurationException): @@ -212,7 +215,6 @@ def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_coun @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1) - trainer.accelerator_backend = GPUAccelerator(trainer) # non-transferrable types primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}] @@ -305,7 +307,6 @@ def to(self, *args, **kwargs): def test_non_blocking(): """ Tests that non_blocking=True only gets passed on torch.Tensor.to, but not on other objects. """ trainer = Trainer() - trainer.accelerator_backend = GPUAccelerator(trainer) batch = torch.zeros(2, 3) with patch.object(batch, 'to', wraps=batch.to) as mocked: diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 969597a10f36d..057512be31af2 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -13,13 +13,13 @@ # limitations under the License. 
import inspect import os -from unittest.mock import MagicMock +from unittest import mock +from unittest.mock import PropertyMock import pytest import torch from pytorch_lightning import Callback, Trainer -from pytorch_lightning.accelerators.legacy.gpu_accelerator import GPUAccelerator from pytorch_lightning.trainer.states import TrainerState from tests.helpers import BoringModel, RandomDataset @@ -144,7 +144,8 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_transfer_batch_hook(): +@mock.patch("pytorch_lightning.accelerators.accelerator.Accelerator.lightning_module", new_callable=PropertyMock) +def test_transfer_batch_hook(model_getter_mock): class CustomBatch: @@ -169,9 +170,8 @@ def transfer_batch_to_device(self, data, device): batch = CustomBatch((torch.zeros(5, 32), torch.ones(5, 1, dtype=torch.long))) trainer = Trainer(gpus=1) - trainer.accelerator_backend = GPUAccelerator(trainer) # running .fit() would require us to implement custom data loaders, we mock the model reference instead - trainer.get_model = MagicMock(return_value=model) + model_getter_mock.return_value = model batch_gpu = trainer.accelerator_backend.batch_to_device(batch, torch.device('cuda:0')) expected = torch.device('cuda', 0) assert model.hook_called diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 19f39b3da4c46..060b78a712e10 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -26,7 +26,7 @@ import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.accelerators.legacy.horovod_accelerator import HorovodAccelerator +from pytorch_lightning.accelerators import CPUAccelerator from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _APEX_AVAILABLE, _HOROVOD_AVAILABLE, _NATIVE_AMP_AVAILABLE @@ -303,13 +303,13 @@ def _compute_batch(): accelerator='horovod', ) - accelerator_backend = trainer.accelerator_connector.select_accelerator() - assert isinstance(accelerator_backend, HorovodAccelerator) + assert isinstance(trainer.accelerator_backend, CPUAccelerator) + # TODO: test that we selected the correct training_type_plugin based on horovod flags metric = Accuracy( compute_on_step=True, dist_sync_on_step=True, - dist_sync_fn=accelerator_backend.gather_all_tensors, + dist_sync_fn=trainer.training_type_plugin.gather_all_tensors, threshold=threshold ) diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py index 5d83b992d757e..6ffbba5c75fed 100644 --- a/tests/models/test_sync_batchnorm.py +++ b/tests/models/test_sync_batchnorm.py @@ -11,13 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import pytest import torch import torch.nn as nn import torch.nn.functional as F from pytorch_lightning import LightningModule, seed_everything, Trainer -from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin +from pytorch_lightning.plugins import DDPSpawnPlugin +from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import FLOAT16_EPSILON from tests.helpers.datamodules import MNISTDataModule @@ -68,6 +71,9 @@ def configure_optimizers(self): # TODO: Fatal Python error: Bus error @pytest.mark.skip(reason="Fatal Python error: Bus error") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif( + not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" +) def test_sync_batchnorm_ddp(tmpdir): seed_everything(234) set_random_master_port() @@ -112,7 +118,15 @@ def test_sync_batchnorm_ddp(tmpdir): sync_batchnorm=True, num_sanity_val_steps=0, replace_sampler_ddp=False, - plugins=[DDPPlugin(find_unused_parameters=True)] + plugins=[ + DDPSpawnPlugin( + parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)], + num_nodes=1, + sync_batchnorm=True, + cluster_environment=TorchElasticEnvironment(), + find_unused_parameters=True + ) + ] ) trainer.fit(model, dm) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index e5895d98b6fcb..d9ea8a9917d2b 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -23,6 +23,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.accelerators import TPUAccelerator from pytorch_lightning.callbacks import EarlyStopping +from pytorch_lightning.plugins import TPUSpawnPlugin from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -59,7 +60,7 @@ def test_model_tpu_cores_1(tmpdir): trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, - max_epochs=1, + max_epochs=2, tpu_cores=1, limit_train_batches=4, limit_val_batches=4, @@ -78,7 +79,7 @@ def test_model_tpu_index(tmpdir, tpu_core): trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, - max_epochs=1, + max_epochs=2, tpu_cores=[tpu_core], limit_train_batches=4, limit_val_batches=4, @@ -99,8 +100,8 @@ def test_model_tpu_cores_8(tmpdir): progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=8, - limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) # 8 cores needs a big dataset @@ -117,10 +118,10 @@ def test_model_16bit_tpu_cores_1(tmpdir): default_root_dir=tmpdir, precision=16, progress_bar_refresh_rate=0, - max_epochs=1, + max_epochs=2, tpu_cores=1, - limit_train_batches=4, - limit_val_batches=4, + limit_train_batches=8, + limit_val_batches=2, ) model = BoringModel() @@ -138,7 +139,7 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core): default_root_dir=tmpdir, precision=16, progress_bar_refresh_rate=0, - max_epochs=1, + max_epochs=2, tpu_cores=[tpu_core], limit_train_batches=4, limit_val_batches=2, @@ -161,8 +162,8 @@ def test_model_16bit_tpu_cores_8(tmpdir): progress_bar_refresh_rate=0, max_epochs=1, tpu_cores=8, - limit_train_batches=0.4, - limit_val_batches=0.4, + limit_train_batches=4, + limit_val_batches=4, ) # 8 cores needs a big dataset @@ -175,6 +176,8 @@ def test_model_16bit_tpu_cores_8(tmpdir): def 
test_model_tpu_early_stop(tmpdir): """Test if single TPU core training works""" + # todo: Test on 8 cores - hanging. + class CustomBoringModel(BoringModel): def validation_step(self, *args, **kwargs): @@ -188,10 +191,10 @@ def validation_step(self, *args, **kwargs): callbacks=[EarlyStopping(monitor='val_loss')], default_root_dir=tmpdir, progress_bar_refresh_rate=0, - max_epochs=50, - limit_train_batches=4, - limit_val_batches=4, - tpu_cores=1, + max_epochs=2, + limit_train_batches=2, + limit_val_batches=2, + tpu_cores=[1], ) trainer.fit(model) @@ -204,11 +207,11 @@ def test_tpu_grad_norm(tmpdir): trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, - max_epochs=1, + max_epochs=4, tpu_cores=1, - limit_train_batches=0.4, - limit_val_batches=0.4, - gradient_clip_val=0.1, + limit_train_batches=4, + limit_val_batches=4, + gradient_clip_val=0.5, ) model = BoringModel() @@ -236,7 +239,7 @@ def test_dataloaders_passed_to_fit(tmpdir): @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires missing TPU") def test_tpu_id_to_be_as_expected(tpu_cores, expected_tpu_id): """Test if trainer.tpu_id is set as expected""" - assert Trainer(tpu_cores=tpu_cores).tpu_id == expected_tpu_id + assert Trainer(tpu_cores=tpu_cores).accelerator_connector.tpu_id == expected_tpu_id def test_tpu_misconfiguration(): @@ -261,15 +264,19 @@ def test_distributed_backend_set_when_using_tpu(tmpdir, tpu_cores): @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") +@pytest.mark.skipif( + not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" +) @pl_multi_process_test def test_broadcast_on_tpu(): """ Checks if an object from the master process is broadcasted to other processes correctly""" def test_broadcast(rank): trainer = Trainer(tpu_cores=8) - backend = TPUAccelerator(trainer) + assert isinstance(trainer.accelerator_backend, TPUAccelerator) + assert isinstance(trainer.training_type_plugin, TPUSpawnPlugin) obj = ("ver_0.5", "logger_name", rank) - result = backend.broadcast(obj) + result = trainer.training_type_plugin.broadcast(obj) assert result == ("ver_0.5", "logger_name", 0) xmp.spawn(test_broadcast, nprocs=8, start_method='fork') @@ -299,7 +306,7 @@ def test_tpu_choice(tmpdir, tpu_cores, expected_tpu_id, error_expected): Trainer(default_root_dir=tmpdir, tpu_cores=tpu_cores) else: trainer = Trainer(default_root_dir=tmpdir, tpu_cores=tpu_cores) - assert trainer.tpu_id == expected_tpu_id + assert trainer.accelerator_connector.tpu_id == expected_tpu_id @pytest.mark.parametrize( diff --git a/tests/plugins/legacy/__init__.py b/tests/plugins/legacy/__init__.py deleted file mode 100644 index b1fca65e60042..0000000000000 --- a/tests/plugins/legacy/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# todo: feel free to move any of these "legacy" tests up... 
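The legacy plugin tests removed below exercised an API in which the Trainer constructed and wired up its DDP plugins internally. Under the refactored API that the updated tests above target, a fully-constructed training-type plugin is handed straight to the Trainer. A minimal sketch of that usage, mirroring the constructor arguments visible in the updated tests in this patch; `MyDDPSpawn` is a hypothetical subclass used purely for illustration and is not part of the patch:

    import torch
    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import DDPSpawnPlugin
    from pytorch_lightning.plugins.environments import TorchElasticEnvironment
    from tests.helpers import BoringModel

    class MyDDPSpawn(DDPSpawnPlugin):
        # hypothetical subclass; hooks such as configure_ddp() would be overridden here
        pass

    # the plugin arrives at the Trainer fully constructed, with its devices and
    # cluster environment already attached
    trainer = Trainer(
        fast_dev_run=True,
        gpus=2,
        accelerator="ddp_spawn",
        plugins=[
            MyDDPSpawn(
                parallel_devices=[torch.device("cuda", 0), torch.device("cuda", 1)],
                cluster_environment=TorchElasticEnvironment(),
            )
        ],
    )
    trainer.fit(BoringModel())

Because plugin selection and configuration no longer route through trainer.accelerator_backend.ddp_plugin, the connector-centric tests that follow could be deleted wholesale.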
diff --git a/tests/plugins/legacy/test_ddp_plugin.py b/tests/plugins/legacy/test_ddp_plugin.py deleted file mode 100644 index 9ec5078811475..0000000000000 --- a/tests/plugins/legacy/test_ddp_plugin.py +++ /dev/null @@ -1,235 +0,0 @@ -import os -import platform -from unittest import mock - -import pytest - -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.legacy.sharded_plugin import DDPShardedPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers.boring_model import BoringModel - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPPlugin) - raise RuntimeError('finished plugin check') - - model = BoringModel() - trainer = Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - callbacks=[CB()], - ) - - with pytest.raises(RuntimeError, match='finished plugin check'): - trainer.fit(model) - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -def test_ddp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): - - class MyDDP(DDPPlugin): - pass - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, MyDDP) - raise RuntimeError('finished plugin check') - - model = BoringModel() - trainer = Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[MyDDP()], - callbacks=[CB()], - ) - - with pytest.raises(RuntimeError, match='finished plugin check'): - trainer.fit(model) - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed sharded plugin is not supported on Windows") -@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPShardedPlugin) - raise RuntimeError('finished plugin check') - - 
model = BoringModel() - trainer = Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins='ddp_sharded', - callbacks=[CB()], - ) - - with pytest.raises(RuntimeError, match='finished plugin check'): - trainer.fit(model) - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -def test_ddp_invalid_choice_string_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): - with pytest.raises(MisconfigurationException, match='not a supported lightning custom plugin'): - Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins='invalid', - ) - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed sharded plugin is not supported on Windows") -@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_invalid_choice_string_and_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): - """ - Test passing a lightning custom ddp plugin and a default ddp plugin throws an error. - """ - - class MyDDP(DDPPlugin): - pass - - with pytest.raises(MisconfigurationException, match='you can only use one DDP plugin in plugins'): - Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=['ddp_sharded', MyDDP()], - ) - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -def test_ddp_choice_custom_ddp_cpu_custom_args(tmpdir, ddp_backend, gpus, num_processes): - - class MyDDP(DDPPlugin): - pass - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, MyDDP) - raise RuntimeError('finished plugin check') - - model = BoringModel() - trainer = Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[MyDDP(broadcast_buffers=False, find_unused_parameters=True)], - callbacks=[CB()], - ) - - with pytest.raises(RuntimeError, match='finished plugin check'): - trainer.fit(model) diff --git a/tests/plugins/legacy/test_plugin.py b/tests/plugins/legacy/test_plugin.py deleted file mode 100644 index 180315d59a310..0000000000000 --- a/tests/plugins/legacy/test_plugin.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -from unittest import mock - -import pytest - -from pytorch_lightning import Callback, Trainer -from pytorch_lightning.plugins.legacy.ddp_plugin import DDPPlugin -from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.helpers.boring_model import BoringModel - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -def test_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes): - """ - Test to ensure that if a plugin requires certain plugin to be added, these are added automatically - """ - - class RequiredPlugin(NativeAMPPlugin): - """ - My custom amp plugin that's required with my DDP plugin as default. - This allows us to ensure this plugin is added when using CustomPlugin rather than ensuring - the user passes it manually into the list. - """ - - class CustomPlugin(DDPPlugin): - - def required_plugins(self, amp_backend: AMPType, trainer: Trainer) -> list: - return [RequiredPlugin(trainer=trainer)] - - class CB(Callback): - - def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, CustomPlugin) - assert isinstance(trainer.precision_connector.backend, RequiredPlugin) - raise RuntimeError('finished plugin check') - - model = BoringModel() - with pytest.warns( - UserWarning, - match=f'plugin {type(CustomPlugin())} has added additional ' - f'required plugins as default: {[type(RequiredPlugin())]}*' - ): - trainer = Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[CustomPlugin()], - callbacks=[CB()], - ) - with pytest.raises(RuntimeError, match='finished plugin check'): - trainer.fit(model) - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) -def test_invalid_custom_required_plugins(tmpdir, ddp_backend, gpus, num_processes): - """ - Test to ensure if the user passes a plugin that conflicts with the required defaults of another plugin, - we throw a warning and error. - The user has to override the required defaults plugin. - """ - - class RequiredPlugin(NativeAMPPlugin): - """ - My custom amp plugin that's required with my DDP plugin as default. - This allows us to ensure this plugin is added when using CustomPlugin rather than ensuring - the user passes it manually into the list. 
- """ - - class CustomPlugin(DDPPlugin): - - def required_plugins(self, amp_backend: AMPType, trainer: Trainer) -> list: - return [RequiredPlugin(trainer=trainer)] - - with pytest.warns(UserWarning, match=f'plugin {type(CustomPlugin())} has added additional ' - f'required plugins as default: {[type(RequiredPlugin())]}*'), \ - pytest.raises(MisconfigurationException, match=f"you can only use one {type(NativeAMPPlugin)}" - f" in plugins. You passed in: {2}"): - Trainer( - fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[CustomPlugin(), NativeAMPPlugin()], - ) diff --git a/tests/plugins/legacy/test_plugin_properties.py b/tests/plugins/legacy/test_plugin_properties.py deleted file mode 100644 index 1a6556c0f76ff..0000000000000 --- a/tests/plugins/legacy/test_plugin_properties.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.legacy.plugin_connector import LightningCustomPlugins, PluginConnector - - -def test_available_plugins_trainer(): - """ Test that available plugins return the correct list in the trainer. """ - plugins = Trainer.available_plugins() - expected_plugins = [e.name for e in LightningCustomPlugins] - assert plugins == expected_plugins - - -def test_available_plugins_connector(): - """ Test that available plugins return the correct list in the connector. 
""" - plugins = PluginConnector.available_plugins() - expected_plugins = [e.name for e in LightningCustomPlugins] - assert plugins == expected_plugins diff --git a/tests/plugins/legacy/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py similarity index 80% rename from tests/plugins/legacy/test_amp_plugin.py rename to tests/plugins/test_amp_plugin.py index ec5f60bb72e7e..80a06b0072e1e 100644 --- a/tests/plugins/legacy/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -6,8 +6,9 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.legacy.native_amp import NativeAMPPlugin +from pytorch_lightning.plugins import NativeMixedPrecisionPlugin from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel @@ -27,28 +28,34 @@ ['ddp_backend', 'gpus', 'num_processes'], [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)], ) -def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): +def on_fit_start(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.precision_connector.backend, NativeAMPPlugin) + assert isinstance(trainer.precision_plugin, NativeMixedPrecisionPlugin) raise SystemExit() - model = BoringModel() - trainer = Trainer( - fast_dev_run=True, - precision=16, - amp_backend='native', - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - callbacks=[CB()], - ) - - with pytest.raises(SystemExit): + def train(): + model = BoringModel() + trainer = Trainer( + fast_dev_run=True, + precision=16, + amp_backend='native', + gpus=gpus, + num_processes=num_processes, + accelerator=ddp_backend, + callbacks=[CB()], + ) trainer.fit(model) + if ddp_backend == "ddp_cpu": + with pytest.raises(MisconfigurationException, match="MP is only available on GPU"): + train() + else: + with pytest.raises(SystemExit): + train() + @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6") @mock.patch.dict( @@ -68,13 +75,13 @@ def on_fit_start(self, trainer, pl_module): ) def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): - class MyNativeAMP(NativeAMPPlugin): + class MyNativeAMP(NativeMixedPrecisionPlugin): pass class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.precision_connector.backend, MyNativeAMP) + assert isinstance(trainer.precision_plugin, MyNativeAMP) raise SystemExit() model = BoringModel() @@ -82,7 +89,6 @@ def on_fit_start(self, trainer, pl_module): fast_dev_run=True, precision=16, amp_backend='native', - gpus=gpus, num_processes=num_processes, accelerator=ddp_backend, plugins=[MyNativeAMP()], diff --git a/tests/plugins/legacy/test_apex_plugin.py b/tests/plugins/test_apex_plugin.py similarity index 87% rename from tests/plugins/legacy/test_apex_plugin.py rename to tests/plugins/test_apex_plugin.py index c816f63bd7595..91d42822db57b 100644 --- a/tests/plugins/legacy/test_apex_plugin.py +++ b/tests/plugins/test_apex_plugin.py @@ -5,7 +5,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.legacy.apex import ApexPlugin +from pytorch_lightning.plugins import ApexMixedPrecisionPlugin from pytorch_lightning.utilities import _APEX_AVAILABLE from tests.helpers.boring_model import BoringModel @@ -31,7 +31,7 
@@ def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.precision_connector.backend, ApexPlugin) + assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin) raise SystemExit() model = BoringModel() @@ -67,13 +67,13 @@ def on_fit_start(self, trainer, pl_module): ) def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): - class MyApexPlugin(ApexPlugin): + class MyApexPlugin(ApexMixedPrecisionPlugin): pass class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.precision_connector.backend, MyApexPlugin) + assert isinstance(trainer.precision_plugin, MyApexPlugin) raise SystemExit() model = BoringModel() @@ -84,7 +84,7 @@ def on_fit_start(self, trainer, pl_module): gpus=gpus, num_processes=num_processes, accelerator=ddp_backend, - plugins=[MyApexPlugin()], + plugins=[MyApexPlugin(amp_level="O2")], callbacks=[CB()], ) diff --git a/tests/plugins/legacy/test_rpc_plugin.py b/tests/plugins/test_rpc_plugin.py similarity index 58% rename from tests/plugins/legacy/test_rpc_plugin.py rename to tests/plugins/test_rpc_plugin.py index d5ddced7c4869..2c074e6c3afda 100644 --- a/tests/plugins/legacy/test_rpc_plugin.py +++ b/tests/plugins/test_rpc_plugin.py @@ -5,9 +5,9 @@ import pytest import torch -from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.legacy.rpc_plugin import RPCPlugin +from pytorch_lightning.plugins.training_type.rpc_sequential import RPCPlugin from pytorch_lightning.utilities import _RPC_AVAILABLE from tests.helpers.boring_model import BoringModel @@ -26,7 +26,7 @@ @mock.patch("torch.cuda.device_count", return_value=2) @pytest.mark.parametrize( ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], + [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp_spawn", 2, 0)], ) @pytest.mark.skipif(not _RPC_AVAILABLE, reason="RPC is not available") def test_rpc_choice(tmpdir, ddp_backend, gpus, num_processes): @@ -34,7 +34,7 @@ def test_rpc_choice(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, RPCPlugin) + assert isinstance(trainer.training_type_plugin, RPCPlugin) raise RuntimeError('finished plugin check') model = BoringModel() @@ -56,34 +56,11 @@ class CustomRPCPlugin(RPCPlugin): def __init__(self, **kwargs): super().__init__(**kwargs) self.rpc_save_model_count = 0 - self.on_main_rpc_connect_count = 0 self.worker_optimizer_step_count = 0 - self.is_main_rpc_process_count = 0 - self.on_exit_rpc_process_count = 0 - self.return_after_exit_rpc_process_count = 0 - - def on_accelerator_exit_rpc_process(self, trainer) -> None: - self.on_exit_rpc_process_count += 1 def rpc_save_model(self, save_model_fn, last_filepath, trainer, pl_module) -> None: self.rpc_save_model_count += 1 - def on_main_rpc_connection(self, trainer) -> None: - self.on_main_rpc_connect_count += 1 - - def worker_optimizer_step(self, model: LightningModule, opt_idx: int, *args, **kwargs) -> None: - self.worker_optimizer_step_count += 1 - - @property - def is_main_rpc_process(self) -> bool: - self.is_main_rpc_process_count += 1 - return torch.distributed.get_rank() == 0 - - @property - def return_after_exit_rpc_process(self) -> bool: - 
self.return_after_exit_rpc_process_count += 1 - return False - def barrier(self, name: Optional[str] = None) -> None: return @@ -111,17 +88,5 @@ def test_rpc_function_calls_ddp(tmpdir): trainer.fit(model) if trainer.global_rank == 0: # Main process assert plugin.rpc_save_model_count == max_epochs - assert plugin.on_main_rpc_connect_count == 1 - assert plugin.worker_optimizer_step_count == max_epochs * limit_train_batches - # Call once at init, and at optim step - assert plugin.is_main_rpc_process_count == 1 + plugin.worker_optimizer_step_count - assert plugin.on_exit_rpc_process_count == 0 else: # Worker process assert plugin.rpc_save_model_count == max_epochs - assert plugin.on_main_rpc_connect_count == 0 - # Never signaled by worker, only by main process - assert plugin.worker_optimizer_step_count == 0 - # Call once at init, and at optim step - assert plugin.is_main_rpc_process_count == 1 + (max_epochs * limit_train_batches) - # Called at init - assert plugin.on_exit_rpc_process_count == 1 diff --git a/tests/plugins/legacy/test_ddp_sequential_plugin.py b/tests/plugins/test_rpc_sequential_plugin.py similarity index 82% rename from tests/plugins/legacy/test_ddp_sequential_plugin.py rename to tests/plugins/test_rpc_sequential_plugin.py index 744a872b00405..d357161a27747 100644 --- a/tests/plugins/legacy/test_ddp_sequential_plugin.py +++ b/tests/plugins/test_rpc_sequential_plugin.py @@ -20,26 +20,19 @@ from torch import nn from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.plugins.legacy.ddp_sequential_plugin import DDPSequentialPlugin +from pytorch_lightning.plugins.training_type.rpc_sequential import RPCSequentialPlugin from pytorch_lightning.utilities import _FAIRSCALE_PIPE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import RandomDataset -def cleanup(ctx, model): - """ - Cleanup function required to ensure we delete the pipe module at the end of the the test on all workers - """ - del model - - @pytest.mark.skipif(not _FAIRSCALE_PIPE_AVAILABLE, reason="test requires FairScale to be installed") @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) -def test_ddp_sequential_plugin_ddp_rpc_manual(tmpdir, args=None): +def test_rpc_sequential_plugin_manual(tmpdir, args=None): model = SequentialModelRPCManual() trainer = Trainer( max_epochs=2, @@ -48,18 +41,18 @@ def test_ddp_sequential_plugin_ddp_rpc_manual(tmpdir, args=None): limit_test_batches=2, gpus=2, distributed_backend="ddp", - plugins=[DDPSequentialPlugin(balance=[2, 1], rpc_timeout_sec=5 * 60)], + plugins=[RPCSequentialPlugin(balance=[2, 1], rpc_timeout_sec=5 * 60)], enable_pl_optimizer=True, ) trainer.fit(model) - if torch_distrib.get_rank() == 0: + if torch_distrib.is_initialized() and torch_distrib.get_rank() == 0: assert len(trainer.dev_debugger.pbar_added_metrics) > 0 if trainer.accelerator_backend.rpc_enabled: # Called at the end of trainer to ensure all processes are killed - trainer.accelerator_backend.ddp_plugin.exit_rpc_process() + trainer.accelerator_backend.training_type_plugin.exit_rpc_process() @pytest.mark.skipif(not _FAIRSCALE_PIPE_AVAILABLE, reason="test requires FairScale to be installed") @@ -68,7 +61,7 @@ def test_ddp_sequential_plugin_ddp_rpc_manual(tmpdir, args=None): @pytest.mark.skipif( not 
os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) -def test_ddp_sequential_plugin_ddp_rpc_manual_amp(tmpdir, args=None): +def test_rpc_sequential_plugin_manual_amp(tmpdir, args=None): model = SequentialModelRPCManual() trainer = Trainer( max_epochs=2, @@ -79,16 +72,14 @@ def test_ddp_sequential_plugin_ddp_rpc_manual_amp(tmpdir, args=None): precision=16, amp_backend="native", distributed_backend="ddp", - plugins=[DDPSequentialPlugin(balance=[2, 1])], + plugins=[RPCSequentialPlugin(balance=[2, 1])], ) - try: + with pytest.raises( + MisconfigurationException, + match='`RPCSequentialPlugin` is currently not supported in Automatic Mixed Precision' + ): trainer.fit(model) - assert len(trainer.dev_debugger.pbar_added_metrics) > 0 - - except MisconfigurationException as e: - assert str(e) == 'DDPSequentialPlugin is currently not supported in Automatic Mixed Precision' - @pytest.mark.skipif(not _FAIRSCALE_PIPE_AVAILABLE, reason="test requires FairScale to be installed") @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @@ -96,7 +87,7 @@ def test_ddp_sequential_plugin_ddp_rpc_manual_amp(tmpdir, args=None): @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) -def test_ddp_sequential_plugin_ddp_rpc_automatic(tmpdir, args=None): +def test_rpc_sequential_plugin_automatic(tmpdir, args=None): model = SequentialModelRPCAutomatic() trainer = Trainer( max_epochs=2, @@ -105,18 +96,17 @@ def test_ddp_sequential_plugin_ddp_rpc_automatic(tmpdir, args=None): limit_test_batches=2, gpus=2, distributed_backend="ddp", - plugins=[DDPSequentialPlugin(balance=[2, 1])], + plugins=[RPCSequentialPlugin(balance=[2, 1])], ) trainer.fit(model) - if torch_distrib.get_rank() == 0: + if torch_distrib.is_initialized() and torch_distrib.get_rank() == 0: assert len(trainer.dev_debugger.pbar_added_metrics) > 0 if trainer.accelerator_backend.rpc_enabled: - # Called at the end of trainer to ensure all processes are killed - trainer.accelerator_backend.ddp_plugin.exit_rpc_process() + trainer.accelerator_backend.training_type_plugin.exit_rpc_process() @pytest.mark.skipif(not _FAIRSCALE_PIPE_AVAILABLE, reason="test requires FairScale to be installed") @@ -125,7 +115,7 @@ def test_ddp_sequential_plugin_ddp_rpc_automatic(tmpdir, args=None): @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) -def test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance(tmpdir, args=None): +def test_rpc_sequential_plugin_with_wrong_balance(tmpdir, args=None): model = SequentialModelRPCAutomatic() trainer = Trainer( max_epochs=2, @@ -134,18 +124,17 @@ def test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance(tmpdir, args=None): limit_test_batches=2, gpus=2, distributed_backend="ddp", - plugins=[DDPSequentialPlugin(balance=[2, 2])], + plugins=[RPCSequentialPlugin(balance=[2, 2])], ) - try: + with pytest.raises( + MisconfigurationException, match="The provided balance sum: 4 does not match your Sequential length: 3" + ): trainer.fit(model) - except MisconfigurationException as e: - assert str(e) == 'The provided balance sum: 4 does not match your Sequential length: 3' - if trainer.accelerator_backend.rpc_enabled: # Called at the end of trainer to ensure all processes are killed - trainer.accelerator_backend.ddp_plugin.exit_rpc_process() + trainer.accelerator_backend.training_type_plugin.exit_rpc_process() class SequentialModelRPCManual(LightningModule): diff --git 
a/tests/plugins/legacy/test_sharded_plugin.py b/tests/plugins/test_sharded_plugin.py similarity index 71% rename from tests/plugins/legacy/test_sharded_plugin.py rename to tests/plugins/test_sharded_plugin.py index 55975146a4064..a3c7ca61f2b47 100644 --- a/tests/plugins/legacy/test_sharded_plugin.py +++ b/tests/plugins/test_sharded_plugin.py @@ -1,37 +1,20 @@ import os import platform -from unittest import mock import pytest import torch from pytorch_lightning import Trainer from pytorch_lightning.callbacks import Callback -from pytorch_lightning.plugins.legacy.sharded_native_amp_plugin import ShardedNativeAMPPlugin -from pytorch_lightning.plugins.legacy.sharded_plugin import _FAIRSCALE_AVAILABLE, DDPShardedPlugin -from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE +from pytorch_lightning.plugins import DDPShardedPlugin, DDPSpawnShardedPlugin +from pytorch_lightning.utilities import _APEX_AVAILABLE, _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) +@pytest.mark.parametrize(["accelerator"], [("ddp_sharded", ), ("ddp_sharded_spawn", )]) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") -def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes): +def test_sharded_ddp_choice(tmpdir, accelerator): """ Test to ensure that plugin is correctly chosen """ @@ -39,16 +22,16 @@ def test_ddp_choice_sharded(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPShardedPlugin) + if accelerator == 'ddp_sharded': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPShardedPlugin) + elif accelerator == 'ddp_sharded_spawn': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPSpawnShardedPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, - gpus=gpus, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[DDPShardedPlugin()], + accelerator=accelerator, callbacks=[CB()], ) @@ -67,8 +50,7 @@ def test_invalid_apex_sharded(tmpdir): with pytest.raises(MisconfigurationException, match='Sharded Plugin is not supported with Apex AMP'): trainer = Trainer( fast_dev_run=True, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', precision=16, amp_backend='apex', ) @@ -76,25 +58,11 @@ def test_invalid_apex_sharded(tmpdir): trainer.fit(model) -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0", - }, -) -@mock.patch("torch.cuda.device_count", return_value=2) -@pytest.mark.parametrize( - ["ddp_backend", "gpus", "num_processes"], - [("ddp_cpu", None, 2), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], -) +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires GPU machine") +@pytest.mark.parametrize(["accelerator"], 
[("ddp_sharded", ), ("ddp_sharded_spawn", )]) @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Requires native AMP") -def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes): +def test_ddp_choice_sharded_amp(tmpdir, accelerator): """ Test to ensure that plugin native amp plugin is correctly chosen when using sharded """ @@ -102,18 +70,18 @@ def test_ddp_choice_sharded_amp(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPShardedPlugin) - assert isinstance(trainer.precision_connector.backend, ShardedNativeAMPPlugin) + if accelerator == 'ddp_sharded': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPShardedPlugin) + elif accelerator == 'ddp_sharded_spawn': + assert isinstance(trainer.accelerator_backend.training_type_plugin, DDPSpawnShardedPlugin) raise SystemExit() model = BoringModel() trainer = Trainer( fast_dev_run=True, - gpus=gpus, + gpus=1, precision=16, - num_processes=num_processes, - accelerator=ddp_backend, - plugins=[DDPShardedPlugin()], + accelerator=accelerator, callbacks=[CB()], ) @@ -129,9 +97,8 @@ def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -143,7 +110,7 @@ def test_ddp_sharded_plugin_checkpoint_cpu(tmpdir): # Assert model parameters are identical after loading for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(ddp_param, shard_param) + assert torch.equal(ddp_param.to("cpu"), shard_param) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -156,8 +123,7 @@ def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir): model = BoringModel() trainer = Trainer( gpus=2, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, ) @@ -169,7 +135,7 @@ def test_ddp_sharded_plugin_checkpoint_multi_gpu(tmpdir): # Assert model parameters are identical after loading for ddp_param, shard_param in zip(model.parameters(), saved_model.parameters()): - assert torch.equal(ddp_param, shard_param) + assert torch.equal(ddp_param.to("cpu"), shard_param) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -182,8 +148,7 @@ def test_ddp_sharded_plugin_finetune(tmpdir): model = BoringModel() trainer = Trainer( gpus=2, - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, ) trainer.fit(model) @@ -204,9 +169,8 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -218,11 +182,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint(tmpdir): model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', - num_processes=2, - plugins=[DDPShardedPlugin()], - fast_dev_run=True, - resume_from_checkpoint=checkpoint_path + accelerator='ddp_sharded_spawn', num_processes=2, fast_dev_run=True, resume_from_checkpoint=checkpoint_path, ) trainer.fit(model) @@ -239,8 +199,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir): """ model 
= BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', fast_dev_run=True, gpus=2, ) @@ -253,11 +212,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_downsize_gpus(tmpdir): model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], - fast_dev_run=True, - gpus=1, - resume_from_checkpoint=checkpoint_path + accelerator='ddp_sharded_spawn', fast_dev_run=True, gpus=1, resume_from_checkpoint=checkpoint_path, ) trainer.fit(model) @@ -272,8 +227,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', - plugins=[DDPShardedPlugin()], + accelerator='ddp_sharded_spawn', gpus=1, fast_dev_run=True, ) @@ -286,11 +240,7 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): model = BoringModel() trainer = Trainer( - plugins=[DDPShardedPlugin()], - accelerator='ddp_cpu', - num_processes=2, - fast_dev_run=True, - resume_from_checkpoint=checkpoint_path + accelerator='ddp_sharded_spawn', num_processes=2, fast_dev_run=True, resume_from_checkpoint=checkpoint_path, ) trainer.fit(model) @@ -298,15 +248,17 @@ def test_ddp_sharded_plugin_resume_from_checkpoint_gpu_to_cpu(tmpdir): @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") +@pytest.mark.skipif( + not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" +) def test_ddp_sharded_plugin_test(tmpdir): """ Test to ensure we can use test without fit """ model = BoringModel() trainer = Trainer( - accelerator='ddp_cpu', + accelerator='ddp_sharded_spawn', num_processes=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) @@ -322,9 +274,8 @@ def test_ddp_sharded_plugin_test_multigpu(tmpdir): """ model = BoringModel() trainer = Trainer( - accelerator='ddp_spawn', + accelerator='ddp_sharded_spawn', gpus=2, - plugins=[DDPShardedPlugin()], fast_dev_run=True, ) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 577e49cec49d2..3ad6e65512585 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -16,12 +16,13 @@ set -e export PL_RUNNING_SPECIAL_TESTS=1 DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no" python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp -python ${DEFAULTS} tests/plugins/legacy/test_rpc_plugin.py::test_rpc_function_calls_ddp -python ${DEFAULTS} tests/plugins/legacy/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual -python ${DEFAULTS} tests/plugins/legacy/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp -python ${DEFAULTS} tests/plugins/legacy/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic +python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp +python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp +python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual +python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp +python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic +python ${DEFAULTS} 
tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection -# python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_dp python ${DEFAULTS} tests/trainer/logging_/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index a6b2fd1ef649d..807c5585ea5bc 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -13,6 +13,7 @@ # limitations under the License. import collections import os +from copy import deepcopy from unittest import mock from unittest.mock import ANY, call, patch @@ -22,6 +23,7 @@ import torch.nn.functional as F from pytorch_lightning import seed_everything, Trainer +from pytorch_lightning.callbacks import Callback from pytorch_lightning.utilities import _APEX_AVAILABLE from tests.helpers.boring_model import BoringModel @@ -344,7 +346,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): # ensure we forward the correct params to the optimizer # without retain_graph we can't do multiple backward passes self.manual_backward(loss_2, opt_b, retain_graph=True) - self.manual_backward(loss_2, opt_a, retain_graph=True) + self.manual_backward(loss_2, opt_a) assert self.layer.weight.grad is not None opt_b.step() @@ -545,7 +547,7 @@ def training_step(self, batch, batch_idx): if self.should_update: self.manual_backward(loss, opt) - opt.step() + opt.step(make_optimizer_step=self.should_have_updated) return loss.detach() if self.detach else loss @@ -564,7 +566,7 @@ def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): assert torch.sum(self.layer.weight.grad) != 0 self.count += 1 - def on_train_end(self): + def on_train_epoch_end(self, *_, **__): assert self.called["training_step"] == 20 assert self.called["on_train_batch_start"] == 20 assert self.called["on_train_batch_end"] == 20 @@ -838,7 +840,7 @@ def optimizer_closure(): retain_graph = num_backward != backward_idx # noqa E225 self.manual_backward(loss_1, opt, retain_graph=retain_graph) - opt.step(closure=optimizer_closure) + opt.step(closure=optimizer_closure, make_optimizer_step=True) def training_epoch_end(self, outputs) -> None: # outputs should be an array with an entry per optimizer @@ -947,95 +949,100 @@ def configure_optimizers(self): mock_adam_step.assert_has_calls(expected_calls) -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@patch("torch.optim.Adam.step") -@patch("torch.optim.SGD.step") -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -@pytest.mark.skipif( - not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" -) -def test_step_with_optimizer_closure_with_different_frequencies_ddp(mock_sgd_step, mock_adam_step, tmpdir): - """ - Tests that `step` works with optimizer_closure and different accumulated_gradient frequency - """ +class TestManualOptimizationDDPCallback(Callback): - class TestModel(BoringModel): + def on_train_end(self, trainer, pl_module): - def __init__(self): - super().__init__() - self.automatic_optimization = False + opt_a, opt_b = pl_module.optimizers() +
assert opt_a._total_optimizer_step_calls == 4 + assert opt_b._total_optimizer_step_calls == 2 - def loss_ones(self, batch, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - def loss_zeros(self, batch, prediction): - # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls - return torch.nn.functional.mse_loss(prediction, torch.zeros_like(prediction)) +class TestManualOptimizationDDPModel(BoringModel): - def manual_sync_grad(self) -> bool: - torch_distrib.all_reduce(self.layer.weight.grad.data, async_op=False) - return True + def __init__(self): + super().__init__() + self.automatic_optimization = False - def training_step(self, batch, batch_idx, optimizer_idx): + def loss_ones(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction)) - # emulate gans training - opt_gen, opt_dis = self.optimizers() + def loss_zeros(self, batch, prediction): + # An arbitrary loss to have a loss that updates the model weights during `Trainer.fit` calls + return torch.nn.functional.mse_loss(prediction, torch.zeros_like(prediction)) - # Note: Be careful, don't log on the same key in self.log in both closure - # as they will be aggregated together on epoch_end + def manual_sync_grad(self) -> bool: + torch_distrib.all_reduce(self.layer.weight.grad.data, async_op=False) + return True - world_size = torch_distrib.get_world_size(torch_distrib.group.WORLD) - assert world_size == 2 + def training_step(self, batch, batch_idx, optimizer_idx): - def compute_loss(): - x = batch[0] - x = F.dropout(x, 0.1) - predictions = self(x) - predictions = F.dropout(predictions, 0.1) - loss_ones = self.loss_ones(None, predictions) - loss_zeros = self.loss_zeros(None, predictions) - return loss_ones, loss_zeros + # emulate gans training + opt_gen, opt_dis = self.optimizers() + + # Note: Be careful, don't log on the same key in self.log in both closure + # as they will be aggregated together on epoch_end + + world_size = torch_distrib.get_world_size(torch_distrib.group.WORLD) + assert world_size == 2 - def make_manual_backward(loss, opt, retain_graph=False): - self.manual_backward(loss, opt, retain_graph=retain_graph) + make_gen_optimizer_step = batch_idx % 2 == 1 + make_dis_optimizer_step = batch_idx % 4 == 0 + + def compute_loss(): + x = batch[0] + x = F.dropout(x, 0.1) + predictions = self(x) + predictions = F.dropout(predictions, 0.1) + loss_ones = self.loss_ones(None, predictions) + loss_zeros = self.loss_zeros(None, predictions) + return loss_ones, loss_zeros + + def make_manual_backward(loss, opt, retain_graph=False, make_optimizer_step=True): + self.manual_backward(loss, opt, retain_graph=retain_graph) + if make_optimizer_step: grad_clone = self.layer.weight.grad.clone() assert self.manual_sync_grad() self.layer.weight.grad /= world_size assert torch.equal(self.layer.weight.grad, grad_clone) - def gen_closure(): - loss_ones_gen, loss_zeros = compute_loss() - make_manual_backward(loss_ones_gen, opt_gen, retain_graph=True) - make_manual_backward(loss_ones_gen, opt_gen) + def gen_closure(): + loss_ones_gen, loss_zeros = compute_loss() + make_manual_backward(loss_ones_gen, opt_gen, retain_graph=True, make_optimizer_step=make_gen_optimizer_step) + make_manual_backward(loss_ones_gen, opt_gen,
make_optimizer_step=make_gen_optimizer_step)
 
-            def dis_closure():
-                loss_ones_gen, loss_zeros = compute_loss()
-                make_manual_backward(loss_ones_gen, opt_dis, retain_graph=True)
-                make_manual_backward(loss_ones_gen, opt_dis)
+        def dis_closure():
+            loss_ones_gen, loss_zeros = compute_loss()
+            make_manual_backward(loss_ones_gen, opt_dis, retain_graph=True, make_optimizer_step=make_dis_optimizer_step)
+            make_manual_backward(loss_ones_gen, opt_dis, make_optimizer_step=make_dis_optimizer_step)
 
-            # this will accumulate gradients for 2 batches and then call opt_gen.step()
-            opt_gen.step(closure=gen_closure, make_optimizer_step=batch_idx % 2 == 0, optim='sgd')
+        # this will accumulate gradients for 2 batches and then call opt_gen.step()
+        opt_gen.step(closure=gen_closure, make_optimizer_step=make_gen_optimizer_step)
 
-            # update discriminator every 4 baches
-            # therefore, no gradient accumulation for discriminator
-            if batch_idx % 4 == 0:
-                # Note: Set make_optimizer_step to True or it will use by default
-                # Trainer(accumulate_grad_batches=x)
-                opt_dis.step(closure=dis_closure, make_optimizer_step=True, optim='adam')
+        # update discriminator every 4 batches
+        # therefore, no gradient accumulation for discriminator
+        if make_dis_optimizer_step:
+            # Note: set make_optimizer_step to True, otherwise the step defaults to
+            # the Trainer(accumulate_grad_batches=x) behaviour
+            opt_dis.step(closure=dis_closure, make_optimizer_step=True)
 
-            def training_epoch_end(self, outputs) -> None:
-                # outputs should be an array with an entry per optimizer
-                assert len(outputs) == 2
+    def training_epoch_end(self, outputs) -> None:
+        # outputs should be an array with an entry per optimizer
+        assert len(outputs) == 2
+
+    def configure_optimizers(self):
+        optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1)
+        optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001)
+        return [optimizer_gen, optimizer_dis]
 
-            def configure_optimizers(self):
-                optimizer_gen = torch.optim.SGD(self.layer.parameters(), lr=0.1)
-                optimizer_dis = torch.optim.Adam(self.layer.parameters(), lr=0.001)
-                return [optimizer_gen, optimizer_dis]
+
+def train_manual_optimization(tmpdir, accelerator):
 
     seed_everything(42)
 
-    model = TestModel()
+    model = TesManualOptimizationDDPModel()
+    model_copy = deepcopy(model)
     model.val_dataloader = None
     model.training_epoch_end = None
 
@@ -1048,12 +1055,32 @@ def configure_optimizers(self):
         log_every_n_steps=1,
         accumulate_grad_batches=2,
         gpus=2,
-        accelerator="ddp",
+        accelerator=accelerator,
+        callbacks=[TestManualOptimizationDDPCallack()]
     )
 
     trainer.fit(model)
 
-    expected_calls = [call(closure=ANY, optim='sgd')] * 4
-    mock_sgd_step.assert_has_calls(expected_calls)
-    expected_calls = [call(closure=ANY, optim='adam')] * 2
-    mock_adam_step.assert_has_calls(expected_calls)
+    for param, param_copy in zip(model.parameters(), model_copy.parameters()):
+        assert not torch.equal(param.cpu().data, param_copy.data)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@pytest.mark.skipif(
+    not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
+)
+def test_step_with_optimizer_closure_with_different_frequencies_ddp(tmpdir):
+    """
+    Tests that `step` works with optimizer_closure and different accumulated_gradient frequency
+    """
+
+    train_manual_optimization(tmpdir, "ddp")
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def 
test_step_with_optimizer_closure_with_different_frequencies_ddp_spawn(tmpdir): + """ + Tests that `step` works with optimizer_closure and different accumulated_gradient frequency + """ + + train_manual_optimization(tmpdir, "ddp_spawn") diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 96ca450783495..da3c6fd5398ad 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -131,7 +131,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(trainer.model, dataloader) + tpipes.run_prediction(trained_model=model, dataloader=dataloader) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 823d1061a67c1..9814e5e87f87c 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1550,23 +1550,31 @@ def test_trainer_predict_dp(tmpdir, num_gpus): @pytest.mark.skipif( not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" ) -@pytest.mark.parametrize('plugins', [None, "ddp_sharded"]) -def test_trainer_predict_ddp(tmpdir, plugins): - predict(tmpdir, "ddp", 2, None, plugins=plugins) +def test_trainer_predict_ddp(tmpdir): + predict(tmpdir, "ddp", 2, None, plugins=["ddp_sharded"]) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") +@pytest.mark.skipif( + not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" +) def test_trainer_predict_ddp_spawn(tmpdir): predict(tmpdir, "ddp_spawn", 2, None) @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="test requires GPU machine") +@pytest.mark.skipif( + not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" +) def test_trainer_predict_1_gpu(tmpdir): predict(tmpdir, None, 1, None) @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") +@pytest.mark.skipif( + not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest" +) def test_trainer_predict_ddp_cpu(tmpdir): predict(tmpdir, "ddp_cpu", 0, 2) @@ -1731,3 +1739,47 @@ def training_epoch_end(self, *args, **kwargs): assert trainer.current_epoch == current_epoch assert model.training_step_invoked == should_train, f"`training_step` {error_string}" assert model.training_epoch_end_invoked == should_train, f"`training_epoch_end` {error_string}" + + +def test_trainer_access_in_configure_optimizers(tmpdir): + """ + Verify that the configure optimizer function can reference the trainer. + """ + + class TestModel(BoringModel): + + def configure_optimizers(self): + assert self.trainer is not None, "Expect to have access to the trainer within `configure_optimizers`" + + train_data = torch.utils.data.DataLoader(RandomDataset(32, 64)) + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True) + trainer.fit(model, train_data) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_setup_hook_move_to_device_correctly(tmpdir): + """ + Verify that if a user defines a layer in the setup hook function, this is moved to the correct device. 
+ """ + + class TestModel(BoringModel): + + def setup(self, stage: str) -> None: + self.new_layer = torch.nn.Linear(2, 2) + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + # will crash if not moved to correct device + output = self.new_layer(output) + loss = self.loss(batch, output) + return {"loss": loss} + + # fake data + train_data = torch.utils.data.DataLoader(RandomDataset(32, 64)) + + # model + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=1) + trainer.fit(model, train_data) From b8619a695f4e1f9a91894badb903ceaa61ea7201 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 13 Feb 2021 01:27:44 +0100 Subject: [PATCH 21/34] new LightningModule hook "configure_callbacks" (#5621) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí Co-authored-by: chaton Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- CHANGELOG.md | 3 + docs/source/common/lightning_module.rst | 6 ++ docs/source/common/trainer.rst | 8 ++ pytorch_lightning/core/lightning.py | 26 ++++++ .../trainer/connectors/callback_connector.py | 36 +++++++- pytorch_lightning/trainer/trainer.py | 1 + tests/callbacks/test_callbacks.py | 92 ++++++++++++++++++- tests/trainer/connectors/__init__.py | 0 .../connectors/test_callback_connector.py | 88 +++++++++++++++++- 9 files changed, 256 insertions(+), 4 deletions(-) create mode 100644 tests/trainer/connectors/__init__.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e51b09899a1fb..5d2b710c4d343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `QuantizationAwareTraining` callback ([#5706](https://github.com/PyTorchLightning/pytorch-lightning/pull/5706)) +- Added `LightningModule.configure_callbacks` to enable the definition of model-specific callbacks ([#5621](https://github.com/PyTorchLightning/pytorch-lightning/pull/5621)) + + ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst index b311507a860a7..943525902f41b 100644 --- a/docs/source/common/lightning_module.rst +++ b/docs/source/common/lightning_module.rst @@ -657,6 +657,12 @@ LightningModule API Methods ^^^^^^^ +configure_callbacks +~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.configure_callbacks + :noindex: + configure_optimizers ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index e759262ed8ba4..35b719505febd 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -535,6 +535,14 @@ Example:: def on_train_end(self, trainer, pl_module): print("Training is done.") + +Model-specific callbacks can also be added inside the ``LightningModule`` through +:meth:`~pytorch_lightning.core.lightning.LightningModule.configure_callbacks`. +Callbacks returned in this hook will extend the list initially given to the ``Trainer`` argument, and replace +the trainer callbacks should there be two or more of the same type. +:class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` callbacks always run last. 
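
A minimal sketch of the behavior described above (the model class and the monitored metrics are illustrative only, not taken from this patch):

```python
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

class LitModel(pl.LightningModule):

    def configure_callbacks(self):
        # Merged with the list given to Trainer(callbacks=...); trainer callbacks
        # of the same type are replaced, and ModelCheckpoint is moved to run last.
        early_stop = EarlyStopping(monitor="val_loss", mode="min")
        checkpoint = ModelCheckpoint(monitor="val_loss")
        return [early_stop, checkpoint]
```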
+
+
 check_val_every_n_epoch
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 59bd10c042018..aa7f909d9b682 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -1042,6 +1042,32 @@ def predict(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = No
         """
         return self(batch)
 
+    def configure_callbacks(self):
+        """
+        Configure model-specific callbacks.
+        When the model gets attached, e.g., when ``.fit()`` or ``.test()`` gets called,
+        the list returned here will be merged with the list of callbacks passed to the Trainer's ``callbacks`` argument.
+        If a callback returned here has the same type as one or several callbacks already present in
+        the Trainer's callbacks list, it will take priority and replace them.
+        In addition, Lightning will make sure :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint`
+        callbacks run last.
+
+        Return:
+            A list of callbacks which will extend the list of callbacks in the Trainer.
+
+        Example::
+
+            def configure_callbacks(self):
+                early_stop = EarlyStopping(monitor="val_acc", mode="max")
+                checkpoint = ModelCheckpoint(monitor="val_loss")
+                return [early_stop, checkpoint]
+
+        Note:
+            Certain callback methods like :meth:`~pytorch_lightning.callbacks.base.Callback.on_init_start`
+            will never be invoked on the new callbacks returned here.
+        """
+        return []
+
     def configure_optimizers(self):
         r"""
         Choose what optimizers and learning-rate schedulers to use in your optimization.
diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py
index 649d8379bbb0e..6ea75c23febf8 100644
--- a/pytorch_lightning/trainer/connectors/callback_connector.py
+++ b/pytorch_lightning/trainer/connectors/callback_connector.py
@@ -15,7 +15,8 @@
 from typing import List, Union
 
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint, ProgressBar, ProgressBarBase
-from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.core.lightning import LightningModule
+from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
@@ -108,6 +109,39 @@ def attach_model_logging_functions(self, model):
             callback.log = model.log
             callback.log_dict = model.log_dict
 
+    @staticmethod
+    def _attach_model_callbacks(model: LightningModule, trainer) -> None:
+        """
+        Attaches the callbacks defined in the model.
+        If a callback returned by the model's ``configure_callbacks`` method has the same type as one or several
+        callbacks already present in the trainer callbacks list, it will replace them.
+        In addition, all :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` callbacks
+        will be pushed to the end of the list, ensuring they run last.
+
+        Args:
+            model: A model which may or may not define new callbacks in
+                :meth:`~pytorch_lightning.core.lightning.LightningModule.configure_callbacks`.
+            trainer: The trainer on which the callbacks get attached/merged.
+ """ + model_callbacks = model.configure_callbacks() + if not model_callbacks: + return + model_callback_types = set(type(c) for c in model_callbacks) + trainer_callback_types = set(type(c) for c in trainer.callbacks) + override_types = model_callback_types.intersection(trainer_callback_types) + if override_types: + rank_zero_info( + "The following callbacks returned in `LightningModule.configure_callbacks` will override" + " existing callbacks passed to Trainer:" + f" {', '.join(sorted(t.__name__ for t in override_types))}" + ) + # remove all callbacks with a type that occurs in model callbacks + all_callbacks = [c for c in trainer.callbacks if type(c) not in override_types] + all_callbacks.extend(model_callbacks) + all_callbacks = CallbackConnector._reorder_callbacks(all_callbacks) + # TODO: connectors refactor: move callbacks list to connector and do not write Trainer state + trainer.callbacks = all_callbacks + @staticmethod def _reorder_callbacks(callbacks: List[Callback]) -> List[Callback]: """ diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 1239ac4913ff5..4f9c5d4f5e19f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -457,6 +457,7 @@ def fit( # hook self.data_connector.prepare_data(model) + self.callback_connector._attach_model_callbacks(model, self) # ---------------------------- # SET UP TRAINING diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 060d42fd5edc3..8bb6d3c8dc815 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. from unittest import mock -from unittest.mock import ANY, call, MagicMock +from unittest.mock import ANY, call, MagicMock, Mock from pytorch_lightning import Trainer from tests.helpers import BoringModel @mock.patch("torch.save") # need to mock torch.save or we get pickle error -def test_trainer_callback_system(torch_save): +def test_trainer_callback_system(torch_save, tmpdir): """Test the callback system.""" model = BoringModel() @@ -27,6 +27,7 @@ def test_trainer_callback_system(torch_save): callback_mock = MagicMock() trainer_options = dict( + default_root_dir=tmpdir, callbacks=[callback_mock], max_epochs=1, limit_val_batches=1, @@ -123,3 +124,90 @@ def test_trainer_callback_system(torch_save): call.teardown(trainer, model, 'fit'), call.teardown(trainer, model, 'test'), ] + + +def test_callbacks_configured_in_model(tmpdir): + """ Test the callback system with callbacks added through the model hook. 
""" + + model_callback_mock = Mock() + trainer_callback_mock = Mock() + + class TestModel(BoringModel): + + def configure_callbacks(self): + return [model_callback_mock] + + model = TestModel() + trainer_options = dict( + default_root_dir=tmpdir, + checkpoint_callback=False, + fast_dev_run=True, + progress_bar_refresh_rate=0, + ) + + def assert_expected_calls(_trainer, model_callback, trainer_callback): + # some methods in callbacks configured through model won't get called + uncalled_methods = [ + call.on_init_start(_trainer), + call.on_init_end(_trainer), + ] + for uncalled in uncalled_methods: + assert uncalled not in model_callback.method_calls + + # assert that the rest of calls are the same as for trainer callbacks + expected_calls = [m for m in trainer_callback.method_calls if m not in uncalled_methods] + assert expected_calls + assert model_callback.method_calls == expected_calls + + # .fit() + trainer_options.update(callbacks=[trainer_callback_mock]) + trainer = Trainer(**trainer_options) + assert trainer_callback_mock in trainer.callbacks + assert model_callback_mock not in trainer.callbacks + trainer.fit(model) + assert model_callback_mock in trainer.callbacks + assert trainer.callbacks[-1] == model_callback_mock + assert_expected_calls(trainer, model_callback_mock, trainer_callback_mock) + + # .test() + model_callback_mock.reset_mock() + trainer_callback_mock.reset_mock() + trainer_options.update(callbacks=[trainer_callback_mock]) + trainer = Trainer(**trainer_options) + trainer.test(model) + assert model_callback_mock in trainer.callbacks + assert trainer.callbacks[-1] == model_callback_mock + assert_expected_calls(trainer, model_callback_mock, trainer_callback_mock) + + +def test_configure_callbacks_hook_multiple_calls(tmpdir): + """ Test that subsequent calls to `configure_callbacks` do not change the callbacks list. 
""" + model_callback_mock = Mock() + + class TestModel(BoringModel): + + def configure_callbacks(self): + return [model_callback_mock] + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + checkpoint_callback=False, + progress_bar_refresh_rate=1, + ) + + callbacks_before_fit = trainer.callbacks.copy() + assert callbacks_before_fit + + trainer.fit(model) + callbacks_after_fit = trainer.callbacks.copy() + assert callbacks_after_fit == callbacks_before_fit + [model_callback_mock] + + trainer.test(model) + callbacks_after_test = trainer.callbacks.copy() + assert callbacks_after_test == callbacks_after_fit + + trainer.test(ckpt_path=None) + callbacks_after_test = trainer.callbacks.copy() + assert callbacks_after_test == callbacks_after_fit diff --git a/tests/trainer/connectors/__init__.py b/tests/trainer/connectors/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/trainer/connectors/test_callback_connector.py b/tests/trainer/connectors/test_callback_connector.py index 930262fc2e6ed..a472f4398c967 100644 --- a/tests/trainer/connectors/test_callback_connector.py +++ b/tests/trainer/connectors/test_callback_connector.py @@ -1,9 +1,17 @@ +import logging from unittest.mock import Mock import torch from pytorch_lightning import Callback, Trainer -from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, ProgressBar +from pytorch_lightning.callbacks import ( + EarlyStopping, + GradientAccumulationScheduler, + LearningRateMonitor, + ModelCheckpoint, + ProgressBar, +) +from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector from tests.helpers import BoringModel @@ -11,14 +19,26 @@ def test_checkpoint_callbacks_are_last(tmpdir): """ Test that checkpoint callbacks always get moved to the end of the list, with preserved order. """ checkpoint1 = ModelCheckpoint(tmpdir) checkpoint2 = ModelCheckpoint(tmpdir) + early_stopping = EarlyStopping() lr_monitor = LearningRateMonitor() progress_bar = ProgressBar() + # no model callbacks model = Mock() model.configure_callbacks.return_value = [] trainer = Trainer(callbacks=[checkpoint1, progress_bar, lr_monitor, checkpoint2]) + cb_connector = CallbackConnector(trainer) + cb_connector._attach_model_callbacks(model, trainer) assert trainer.callbacks == [progress_bar, lr_monitor, checkpoint1, checkpoint2] + # with model-specific callbacks that substitute ones in Trainer + model = Mock() + model.configure_callbacks.return_value = [checkpoint1, early_stopping, checkpoint2] + trainer = Trainer(callbacks=[progress_bar, lr_monitor, ModelCheckpoint(tmpdir)]) + cb_connector = CallbackConnector(trainer) + cb_connector._attach_model_callbacks(model, trainer) + assert trainer.callbacks == [progress_bar, lr_monitor, early_stopping, checkpoint1, checkpoint2] + class StatefulCallback0(Callback): @@ -53,3 +73,69 @@ def test_all_callback_states_saved_before_checkpoint_callback(tmpdir): assert "content0" in state0 and state0["content0"] == 0 assert "content1" in state1 and state1["content1"] == 1 assert type(checkpoint_callback) in ckpt["callbacks"] + + +def test_attach_model_callbacks(): + """ Test that the callbacks defined in the model and through Trainer get merged correctly. 
""" + + def assert_composition(trainer_callbacks, model_callbacks, expected): + model = Mock() + model.configure_callbacks.return_value = model_callbacks + trainer = Trainer(checkpoint_callback=False, progress_bar_refresh_rate=0, callbacks=trainer_callbacks) + cb_connector = CallbackConnector(trainer) + cb_connector._attach_model_callbacks(model, trainer) + assert trainer.callbacks == expected + + early_stopping = EarlyStopping() + progress_bar = ProgressBar() + lr_monitor = LearningRateMonitor() + grad_accumulation = GradientAccumulationScheduler({1: 1}) + + # no callbacks + assert_composition(trainer_callbacks=[], model_callbacks=[], expected=[]) + + # callbacks of different types + assert_composition( + trainer_callbacks=[early_stopping], model_callbacks=[progress_bar], expected=[early_stopping, progress_bar] + ) + + # same callback type twice, different instance + assert_composition( + trainer_callbacks=[progress_bar, EarlyStopping()], + model_callbacks=[early_stopping], + expected=[progress_bar, early_stopping] + ) + + # multiple callbacks of the same type in trainer + assert_composition( + trainer_callbacks=[LearningRateMonitor(), + EarlyStopping(), + LearningRateMonitor(), + EarlyStopping()], + model_callbacks=[early_stopping, lr_monitor], + expected=[early_stopping, lr_monitor] + ) + + # multiple callbacks of the same type, in both trainer and model + assert_composition( + trainer_callbacks=[ + LearningRateMonitor(), progress_bar, + EarlyStopping(), + LearningRateMonitor(), + EarlyStopping() + ], + model_callbacks=[early_stopping, lr_monitor, grad_accumulation, early_stopping], + expected=[progress_bar, early_stopping, lr_monitor, grad_accumulation, early_stopping] + ) + + +def test_attach_model_callbacks_override_info(caplog): + """ Test that the logs contain the info about overriding callbacks returned by configure_callbacks. 
""" + model = Mock() + model.configure_callbacks.return_value = [LearningRateMonitor(), EarlyStopping()] + trainer = Trainer(checkpoint_callback=False, callbacks=[EarlyStopping(), LearningRateMonitor(), ProgressBar()]) + cb_connector = CallbackConnector(trainer) + with caplog.at_level(logging.INFO): + cb_connector._attach_model_callbacks(model, trainer) + + assert "existing callbacks passed to Trainer: EarlyStopping, LearningRateMonitor" in caplog.text From 42dc5d2af1ef00fe9725f27033549652a2ab4faa Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Sat, 13 Feb 2021 13:06:22 +0530 Subject: [PATCH 22/34] Fix: Repeated .fit() calls ignore max_steps iteration bound (#5936) * fix repeated fit calls ignoring max_steps * fix fast dev progress bar --- pytorch_lightning/trainer/training_loop.py | 5 +++-- tests/callbacks/test_progress_bar.py | 2 -- tests/trainer/test_trainer.py | 26 ++++++++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 03a72eb71ab84..f727a15310a84 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -75,7 +75,7 @@ def on_trainer_init( # If neither max_epochs or max_steps is set, then use existing default of max_epochs = 1000 self.trainer.max_epochs = 1000 if (max_epochs is None and max_steps is None) else max_epochs - # If neither max_epochs or max_steps is set, then use existing default of min_epochs = 1 + # If neither min_epochs or min_steps is set, then use existing default of min_epochs = 1 self.trainer.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs self.trainer.max_steps = max_steps self.trainer.min_steps = min_steps @@ -97,8 +97,9 @@ def num_optimizers(self): return num_optimizers def should_skip_training(self): + should_by_max_steps = self.trainer.max_steps is not None and self.trainer.global_step >= self.trainer.max_steps should_by_epoch = self.trainer.max_epochs is not None and self.trainer.current_epoch >= self.trainer.max_epochs - return should_by_epoch or self.trainer.num_training_batches == 0 + return should_by_max_steps or should_by_epoch or self.trainer.num_training_batches == 0 def on_train_start(self): # hook diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index 5f861d7a2cce9..802b43bbd034d 100644 --- a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -146,8 +146,6 @@ def test_progress_bar_fast_dev_run(tmpdir): assert 1 == progress_bar.total_train_batches # total val batches are known only after val dataloaders have reloaded - trainer.fit(model) - assert 1 == progress_bar.total_val_batches assert 1 == progress_bar.train_batch_idx assert 1 == progress_bar.val_batch_idx diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9814e5e87f87c..4e85a5695b9f2 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1741,6 +1741,32 @@ def training_epoch_end(self, *args, **kwargs): assert model.training_epoch_end_invoked == should_train, f"`training_epoch_end` {error_string}" +@pytest.mark.parametrize(["max_steps", "max_epochs", "global_step"], [(10, 5, 10), (20, None, 20)]) +def test_repeated_fit_calls_with_max_epochs_and_steps(tmpdir, max_steps, max_epochs, global_step): + """ + Ensure that the training loop is bound by `max_steps` and + `max_epochs` for repeated calls of `trainer.fit`, and + disabled 
if the limit is reached + """ + + dataset_len = 200 + batch_size = 10 + + train_data = DataLoader(RandomDataset(32, dataset_len), batch_size=batch_size) + + model = BoringModel() + + trainer = Trainer( + default_root_dir=tmpdir, + max_steps=max_steps, + max_epochs=max_epochs, + ) + trainer.fit(model, train_data) + assert trainer.global_step == global_step + trainer.fit(model, train_data) + assert trainer.global_step == global_step + + def test_trainer_access_in_configure_optimizers(tmpdir): """ Verify that the configure optimizer function can reference the trainer. From 046ac714f6955ed14b831657ea1b7b16bc28ac93 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 13 Feb 2021 13:58:25 +0100 Subject: [PATCH 23/34] v1.2.0rc1 (#5946) * v1.2.0rc0 * chlog * chlog * chlog * chlog Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- CHANGELOG.md | 94 ++++++++++++++++++++++++++++++++--- pytorch_lightning/__init__.py | 2 +- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d2b710c4d343..306500c3e6f42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased.Features] - YYYY-MM-DD +## [1.2] - YYYY-MM-DD ### Added @@ -59,10 +59,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added missing val/test hooks in `LightningModule` ([#5467](https://github.com/PyTorchLightning/pytorch-lightning/pull/5467)) -- `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842)) +- The `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842)) -- Added `ModelPruning` Callback ([#5618](https://github.com/PyTorchLightning/pytorch-lightning/pull/5618)) +- Added `ModelPruning` Callback ([#5618](https://github.com/PyTorchLightning/pytorch-lightning/pull/5618), + [#5825](https://github.com/PyTorchLightning/pytorch-lightning/pull/5825)) - Added `PyTorchProfiler` ([#5560](https://github.com/PyTorchLightning/pytorch-lightning/pull/5560)) @@ -83,6 +84,24 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `LightningModule.configure_callbacks` to enable the definition of model-specific callbacks ([#5621](https://github.com/PyTorchLightning/pytorch-lightning/pull/5621)) +- Added promxial policy optimization template to pl_examples ([#5394](https://github.com/PyTorchLightning/pytorch-lightning/pull/5394)) + + +- Added `log_graph` to `CometLogger` ([#5295](https://github.com/PyTorchLightning/pytorch-lightning/pull/5295)) + + +- Added possibility for nested loaders ([#5404](https://github.com/PyTorchLightning/pytorch-lightning/pull/5404)) + + +- Added `sync_step` to Wandb logger ([#5351](https://github.com/PyTorchLightning/pytorch-lightning/pull/5351)) + + +- Added `StochasticWeightAveraging` callback ([#5640](https://github.com/PyTorchLightning/pytorch-lightning/pull/5640)) + + +- Added `LightningDataModule.from_datasets(...)` ([#5133](https://github.com/PyTorchLightning/pytorch-lightning/pull/5133)) + + ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) @@ -127,15 +146,50 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Forced `ModelCheckpoint` callbacks to run after all others to guarantee all states are saved to the checkpoint ([#5731](https://github.com/PyTorchLightning/pytorch-lightning/pull/5731)) -- Refactored Accelerators and Plugins +- Refactored Accelerators and Plugins ([#5743](https://github.com/PyTorchLightning/pytorch-lightning/pull/5743)) * Added base classes for plugins ([#5715](https://github.com/PyTorchLightning/pytorch-lightning/pull/5715)) * Added parallel plugins for DP, DDP, DDPSpawn, DDP2 and Horovod ([#5714](https://github.com/PyTorchLightning/pytorch-lightning/pull/5714)) + * Precision Plugins ([#5718](https://github.com/PyTorchLightning/pytorch-lightning/pull/5718)) * Added new Accelerators for CPU, GPU and TPU ([#5719](https://github.com/PyTorchLightning/pytorch-lightning/pull/5719)) * Added Plugins for TPU training ([#5719](https://github.com/PyTorchLightning/pytorch-lightning/pull/5719)) * Added RPC and Sharded plugins ([#5732](https://github.com/PyTorchLightning/pytorch-lightning/pull/5732)) * Added missing `LightningModule`-wrapper logic to new plugins and accelerator ([#5734](https://github.com/PyTorchLightning/pytorch-lightning/pull/5734)) +- Enabled `self.log` in callbacks ([#5094](https://github.com/PyTorchLightning/pytorch-lightning/pull/5094)) + + +- Renamed xxx_AVAILABLE as protected ([#5082](https://github.com/PyTorchLightning/pytorch-lightning/pull/5082)) + + +- Unified module names in Utils ([#5199](https://github.com/PyTorchLightning/pytorch-lightning/pull/5199)) + + +- Separated utils: imports & enums ([#5256](https://github.com/PyTorchLightning/pytorch-lightning/pull/5256) + [#5874](https://github.com/PyTorchLightning/pytorch-lightning/pull/5874)) + + +- Refactor: clean trainer device & distributed getters ([#5300](https://github.com/PyTorchLightning/pytorch-lightning/pull/5300)) + + +- Simplified training phase as LightningEnum ([#5419](https://github.com/PyTorchLightning/pytorch-lightning/pull/5419)) + + +- Updated metrics to use LightningEnum ([#5689](https://github.com/PyTorchLightning/pytorch-lightning/pull/5689)) + + +- Changed the seq of `on_train_batch_end`, `on_batch_end` & `on_train_epoch_end`, `on_epoch_end hooks` 
([#5688](https://github.com/PyTorchLightning/pytorch-lightning/pull/5688)) + + +- Refactored `setup_training` and remove `test_mode` ([#5388](https://github.com/PyTorchLightning/pytorch-lightning/pull/5388)) + + +- Disabled training with zero `num_training_batches` when insufficient `limit_train_batches` ([#5703](https://github.com/PyTorchLightning/pytorch-lightning/pull/5703)) + + +- Refactored `EpochResultStore` ([#5522](https://github.com/PyTorchLightning/pytorch-lightning/pull/5522)) + + ### Deprecated - Function `stat_scores_multiple_classes` is deprecated in favor of `stat_scores` ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) @@ -150,6 +204,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `LightningDataParallel` in favor of new wrapper module `LightningParallelModule` ([#5670](https://github.com/PyTorchLightning/pytorch-lightning/pull/5670)) +- Renamed utils modules ([#5199](https://github.com/PyTorchLightning/pytorch-lightning/pull/5199)) + * `argparse_utils` >> `argparse` + * `model_utils` >> `model_helpers` + * `warning_utils` >> `warnings` + * `xla_device_utils` >> `xla_device` + + ### Removed - Removed deprecated checkpoint argument `filepath` ([#5321](https://github.com/PyTorchLightning/pytorch-lightning/pull/5321)) @@ -187,6 +248,27 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed support custom DataLoader with DDP if they can be re-instantiated ([#5745](https://github.com/PyTorchLightning/pytorch-lightning/pull/5745)) +- Fixed repeated `.fit()` calls ignore max_steps iteration bound ([#5936](https://github.com/PyTorchLightning/pytorch-lightning/pull/5936)) + + +- Fixed throwing `MisconfigurationError` on unknown mode ([#5255](https://github.com/PyTorchLightning/pytorch-lightning/pull/5255)) + + +- Resolve bug with Finetuning ([#5744](https://github.com/PyTorchLightning/pytorch-lightning/pull/5744)) + + +- Fixed `ModelCheckpoint` race condition in file existence check ([#5155](https://github.com/PyTorchLightning/pytorch-lightning/pull/5155)) + + +- Fixed some compatibility with PyTorch 1.8 ([#5864](https://github.com/PyTorchLightning/pytorch-lightning/pull/5864)) + + +- Fixed forward cache ([#5895](https://github.com/PyTorchLightning/pytorch-lightning/pull/5895)) + + +- Fixed passing wrong strings for scheduler interval doesn't throw an error ([#5923](https://github.com/PyTorchLightning/pytorch-lightning/pull/5923)) + + ## [1.1.8] - 2021-02-08 ### Fixed @@ -429,7 +511,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed checkpoint `hparams` dict casting when `omegaconf` is available ([#4770](https://github.com/PyTorchLightning/pytorch-lightning/pull/4770)) - Fixed incomplete progress bars when total batches not divisible by refresh rate ([#4577](https://github.com/PyTorchLightning/pytorch-lightning/pull/4577)) -- Updated SSIM metric (#4566)([#4656](https://github.com/PyTorchLightning/pytorch-lightning/pull/4656)) +- Updated SSIM metric ([#4566](https://github.com/PyTorchLightning/pytorch-lightning/pull/4566)) - Fixed batch_arg_name - add `batch_arg_name` to all calls to `_adjust_batch_size`bug ([#4812](https://github.com/PyTorchLightning/pytorch-lightning/pull/4812)) - Fixed `torchtext` data to GPU ([#4785](https://github.com/PyTorchLightning/pytorch-lightning/pull/4785)) - Fixed a crash bug in MLFlow logger ([#4716](https://github.com/PyTorchLightning/pytorch-lightning/pull/4716)) @@ -1130,7 +1212,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Regression metrics ([#2221](https://github.com/PyTorchLightning/pytorch-lightning/pull/2221)) - Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)) - Allow dataloaders without sampler field present ([#1907](https://github.com/PyTorchLightning/pytorch-lightning/pull/1907)) -- Added option `save_last` to save the model at the end of every epoch in `ModelCheckpoint` [(#1908)](https://github.com/PyTorchLightning/pytorch-lightning/pull/1908) +- Added option `save_last` to save the model at the end of every epoch in `ModelCheckpoint` ([#1908](https://github.com/PyTorchLightning/pytorch-lightning/pull/1908)) - Early stopping checks `on_validation_end` ([#1458](https://github.com/PyTorchLightning/pytorch-lightning/pull/1458)) - Attribute `best_model_path` to `ModelCheckpoint` for storing and later retrieving the path to the best saved model file ([#1799](https://github.com/PyTorchLightning/pytorch-lightning/pull/1799)) - Speed up single-core TPU training by loading data using `ParallelLoader` ([#2033](https://github.com/PyTorchLightning/pytorch-lightning/pull/2033)) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 5f115ef98fbb1..be2756ebf4bd6 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -5,7 +5,7 @@ import time _this_year = time.strftime("%Y") -__version__ = '1.2.0dev' +__version__ = '1.2.0rc1' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From d81851843ea89913ed1b222c5ef0b751435a54a0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 11:51:33 -0500 Subject: [PATCH 24/34] Update README.md --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index cf9107b004f32..8b19e4cd85c8f 100644 --- a/README.md +++ b/README.md @@ -41,11 +41,6 @@ Scale your models, not the boilerplate.** --- -## NEWS -[Dec 2020 - Read about how Facebook uses Lightning to standardize deep learning across research and production teams](https://ai.facebook.com/blog/reengineering-facebook-ais-deep-learning-platforms-for-interoperability) - ---- - ## PyTorch Lightning is just organized PyTorch Lightning disentangles PyTorch code to decouple the science from the engineering. 
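
The animation below shows exactly this conversion. As a minimal textual sketch of the same idea (the `LitClassifier` here is illustrative; the README's full `LitAutoEncoder` example appears further down):

```python
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn import functional as F

class LitClassifier(pl.LightningModule):
    # Research code lives in the LightningModule; the engineering
    # (loops, devices, checkpointing) is owned by the Trainer.

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return F.cross_entropy(self(x), y)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
```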
![PT to PL](docs/source/_static/images/general/pl_quick_start_full_compressed.gif) From 7e69fbbc2e064f12e35b21064caf43c340ea2a4e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 11:52:02 -0500 Subject: [PATCH 25/34] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 8b19e4cd85c8f..06d8d2e697613 100644 --- a/README.md +++ b/README.md @@ -50,10 +50,10 @@ Lightning disentangles PyTorch code to decouple the science from the engineering ## Lightning Philosophy Lightning is designed with these principles in mind: -Principle 1: Enable maximal flexibility. -Principle 2: Abstract away unnecessary boilerplate, but make it accessible when needed. -Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). -Principle 4: Deep learning code should be organized into 4 distinct categories. +Principle 1: Enable maximal flexibility. +Principle 2: Abstract away unnecessary boilerplate, but make it accessible when needed. +Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). +Principle 4: Deep learning code should be organized into 4 distinct categories. - Research code (the LightningModule). - Engineering code (you delete, and is handled by the Trainer). From 68aac1f9dfeb63980819a2262643430d8da59a8b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 12:13:14 -0500 Subject: [PATCH 26/34] Update README.md --- README.md | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 06d8d2e697613..020236da3b3d6 100644 --- a/README.md +++ b/README.md @@ -107,28 +107,32 @@ From Conda conda install pytorch-lightning -c conda-forge ``` - +
+ Other options + -#### Install bleeding-edge - future 1.2 + #### Install bleeding-edge - future 1.2 -the actual status of 1.2 [nightly] is following: + the actual status of 1.2 [nightly] is following: -![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.2-dev&event=push) -![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.2-dev&event=push) -![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.2-dev&event=push) -![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2-dev&event=push) -![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2-dev&event=push) + ![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.2-dev&event=push) + ![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.2-dev&event=push) + ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.2-dev&event=push) + ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2-dev&event=push) + ![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2-dev&event=push) -Install future release from the source (no guarantees) -```bash -pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.2-dev --upgrade -``` -or nightly from testing PyPI -```bash -pip install -iU https://test.pypi.org/simple/ pytorch-lightning -``` + Install future release from the source (no guarantees) + ```bash + pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.2-dev --upgrade + ``` + or nightly from testing PyPI + ```bash + pip install -iU https://test.pypi.org/simple/ pytorch-lightning + ``` + + +
- ### Step 1: Add these imports From fcf894b621e6a0afcc39813c003d409387c32530 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 12:21:31 -0500 Subject: [PATCH 27/34] Update README.md --- README.md | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 020236da3b3d6..d2f3cc397e48b 100644 --- a/README.md +++ b/README.md @@ -191,17 +191,25 @@ trainer = pl.Trainer() trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) ``` -#### And without changing a single line of code, you could run on GPUs/TPUs -```python -# 8 GPUs -trainer = Trainer(max_epochs=1, gpus=8) - -# 256 GPUs -trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32) +
+ Train on GPUs without code changes + + ```python + # 8 GPUs + trainer = Trainer(max_epochs=1, gpus=8) + + # 256 GPUs + trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32) + ``` +
-# TPUs -trainer = Trainer(tpu_cores=8) -``` +
+ Train on TPUs without code changes + + ```python + trainer = Trainer(tpu_cores=8) + ``` +
#### And even export for production via onnx or torchscript ```python From 891fc64af7ddfdaa2a9ac43d2cfb5256151809ca Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 12:27:44 -0500 Subject: [PATCH 28/34] Update README.md --- README.md | 67 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index d2f3cc397e48b..7df2bfe035ef8 100644 --- a/README.md +++ b/README.md @@ -191,39 +191,62 @@ trainer = pl.Trainer() trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) ``` +### Advanced features +Lightning has over [40+ advanced features](https://pytorch-lightning.readthedocs.io/en/stable/trainer.html#trainer-flags) designed for professional AI research at scale. + +Here are some examples: + +
Train on GPUs without code changes - ```python - # 8 GPUs - trainer = Trainer(max_epochs=1, gpus=8) + ```python + # 8 GPUs + trainer = Trainer(max_epochs=1, gpus=8) - # 256 GPUs - trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32) - ``` + # 256 GPUs + trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32) + ```
Train on TPUs without code changes - ```python - trainer = Trainer(tpu_cores=8) - ``` + ```python + trainer = Trainer(tpu_cores=8) + ```
-#### And even export for production via onnx or torchscript -```python -# torchscript -autoencoder = LitAutoEncoder() -torch.jit.save(autoencoder.to_torchscript(), "model.pt") - -# onnx -with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile: - autoencoder = LitAutoEncoder() - input_sample = torch.randn((1, 64)) - autoencoder.to_onnx(tmpfile.name, input_sample, export_params=True) - os.path.isfile(tmpfile.name) -``` +
+ 16-bit precision + + ```python + trainer = Trainer(precision=16) + ``` +
+ +
+ Export to torchscript (JIT) (production use) + + ```python + # torchscript + autoencoder = LitAutoEncoder() + torch.jit.save(autoencoder.to_torchscript(), "model.pt") + ``` +
+ +
+ Export to ONNX (production use) + + ```python + # onnx + with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile: + autoencoder = LitAutoEncoder() + input_sample = torch.randn((1, 64)) + autoencoder.to_onnx(tmpfile.name, input_sample, export_params=True) + os.path.isfile(tmpfile.name) + ``` +
#### For advanced users, you can still own complex training loops From ecf995bad9e0a46e6b175ad1f155c4cfb0f75dfc Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 12:31:03 -0500 Subject: [PATCH 29/34] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7df2bfe035ef8..1e5fe5ebe94e1 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ trainer = pl.Trainer() trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) ``` -### Advanced features +## Advanced features Lightning has over [40+ advanced features](https://pytorch-lightning.readthedocs.io/en/stable/trainer.html#trainer-flags) designed for professional AI research at scale. Here are some examples: @@ -248,7 +248,8 @@ Here are some examples: ``` -#### For advanced users, you can still own complex training loops +### Pro-level control of training loops (advanced users) +For complex/professional level work, you have optional full control of the training loop and optimizers. ```python class LitAutoEncoder(pl.LightningModule): From e839d3bc2265a355094752b60b0cb06a7ebd5835 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 13 Feb 2021 13:43:29 -0500 Subject: [PATCH 30/34] Update README.md --- README.md | 65 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 1e5fe5ebe94e1..d907087f77fbd 100644 --- a/README.md +++ b/README.md @@ -47,18 +47,19 @@ Lightning disentangles PyTorch code to decouple the science from the engineering --- -## Lightning Philosophy -Lightning is designed with these principles in mind: +## Lightning Design Philosophy +Lightning structures PyTorch code with these principles: -Principle 1: Enable maximal flexibility. -Principle 2: Abstract away unnecessary boilerplate, but make it accessible when needed. -Principle 3: Systems should be self-contained (ie: optimizers, computation code, etc). -Principle 4: Deep learning code should be organized into 4 distinct categories. +
+ +
+ +Lightning forces the following structure to your code which makes it reusable and shareable: - - Research code (the LightningModule). - - Engineering code (you delete, and is handled by the Trainer). - - Non-essential research code (logging, etc... this goes in Callbacks). - - Data (use PyTorch Dataloaders or organize them into a LightningDataModule). +- Research code (the LightningModule). +- Engineering code (you delete, and is handled by the Trainer). +- Non-essential research code (logging, etc... this goes in Callbacks). +- Data (use PyTorch Dataloaders or organize them into a LightningDataModule). Once you do this, you can train on multiple-GPUs, TPUs, CPUs and even in 16-bit precision without changing your code! @@ -66,29 +67,29 @@ Get started with our [2 step guide](https://pytorch-lightning.readthedocs.io/en/ --- -## Inference -Lightning is also designed for the fast inference AI researchers and production teams need to scale up things like BERT and self-supervised learning. -Lightning can automatically export to ONNX or TorchScript for those cases. - ---- - ## Continuous Integration -
- -| System / PyTorch ver. | 1.4 (min. req.)* | 1.5 | 1.6 | 1.7 (latest) | 1.8 (nightly) | -| :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | -| Linux py3.7 [GPUs**] | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PyTorchLightning.pytorch-lightning?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=2&branchName=master) | - | - | -| Linux py3.{6,7} [TPUs***] | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | -| Linux py3.{6,7} | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| OSX py3.{6,7,8} | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| Windows py3.{6,7,8} | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - -- _\** tests run on two NVIDIA K80_ -- _\*** tests run on Google GKE TPUv2/3_ -- _TPU w/ py3.6/py3.7 means we support Colab and Kaggle env._ - -
+Lightning is rigorously tested across multiple GPUs, TPUs, CPUs and against major Python and PyTorch versions.
+
+
+ Current build statuses + +
+ + | System / PyTorch ver. | 1.4 (min. req.)* | 1.5 | 1.6 | 1.7 (latest) | 1.8 (nightly) | + | :---: | :---: | :---: | :---: | :---: | :---: | + | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | + | Linux py3.7 [GPUs**] | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PyTorchLightning.pytorch-lightning?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=2&branchName=master) | - | - | + | Linux py3.{6,7} [TPUs***] | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | + | Linux py3.{6,7} | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | + | OSX py3.{6,7,8} | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | + | Windows py3.{6,7,8} | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | + + - _\** tests run on two NVIDIA K80_ + - _\*** tests run on Google GKE TPUv2/3_ + - _TPU w/ py3.6/py3.7 means we support Colab and Kaggle env._ + +
+</center>
+</details>
 ---

From 11942558d0bbab5569d7c8d80a5f80538dfdb1de Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Feb 2021 13:50:18 -0500
Subject: [PATCH 31/34] Update README.md

---
 README.md | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index d907087f77fbd..f70db5d430154 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,19 @@ Lightning is rigorously tested across multiple GPUs, TPUs, CPUs and against major
+<details>
+  <summary>Bleeding edge build status (1.2)</summary>
+
+<center>
+ + ![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.2-dev&event=push) + ![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.2-dev&event=push) + ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.2-dev&event=push) + ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2-dev&event=push) + ![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2-dev&event=push) +
+</center>
+</details>
+ --- ## How To Use @@ -101,26 +114,29 @@ Simple installation from PyPI ```bash pip install pytorch-lightning ``` -_To get full package experience you can install also all optional dependencies with `pytorch-lightning['extra']` or for CPU users with `pytorch-lightning['cpu-extra']`._ - -From Conda -```bash -conda install pytorch-lightning -c conda-forge -```
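For anyone following the install steps above, a quick way to confirm the environment is healthy (an illustrative check, not part of the patch):

```python
# A quick post-install sanity check (illustrative; not part of the diff above).
import torch
import pytorch_lightning as pl

print(pl.__version__)             # installed Lightning version
print(torch.cuda.is_available())  # True if the GPU extras can actually be used
```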
 <details>
-  <summary>Other options</summary>
+  <summary>Other installation options</summary>
+
+  #### Install with optional dependencies (CPU)
+
+  ```bash
+  pip install pytorch-lightning['cpu-extra']
+  ```

-  #### Install bleeding-edge - future 1.2
-
-  the actual status of 1.2 [nightly] is following:
+  #### Install with optional dependencies (GPU, TPU)
+
+  ```bash
+  pip install pytorch-lightning['extra']
+  ```
+
+  #### Conda
+  ```bash
+  conda install pytorch-lightning -c conda-forge
+  ```

-  ![CI base testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20base%20testing/badge.svg?branch=release%2F1.2-dev&event=push)
-  ![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.2-dev&event=push)
-  ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.2-dev&event=push)
-  ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.2-dev&event=push)
-  ![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.2-dev&event=push)
+  #### Install bleeding-edge - future 1.2

 Install future release from the source (no guarantees)
 ```bash

From d924dd6a41eee534f4138c2f21585a0d6952f995 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Feb 2021 14:01:34 -0500
Subject: [PATCH 32/34] Update README.md

---
 README.md | 76 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 56 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index f70db5d430154..e09c1f25d1ba8 100644
--- a/README.md
+++ b/README.md
@@ -218,18 +218,20 @@ Here are some examples:

 <details>
   <summary>Train on GPUs without code changes</summary>

   ```python
-  # 8 GPUs
-  trainer = Trainer(max_epochs=1, gpus=8)
+  # 8 GPUs
+  # no code changes needed
+  trainer = Trainer(max_epochs=1, gpus=8)

-  # 256 GPUs
-  trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32)
-  ```
+  # 256 GPUs
+  trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32)
+  ```
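A note on the `num_nodes=32` snippet above: multi-node runs also need a distributed backend. A hedged sketch (the `accelerator='ddp'` argument is an assumption; the snippet itself leaves the backend implicit):

```python
from pytorch_lightning import Trainer

# Sketch: multi-node training typically pairs num_nodes with a DDP backend.
# accelerator='ddp' is an assumption here, not part of the patch above.
trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32, accelerator='ddp')
```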
 <details>
   <summary>Train on TPUs without code changes</summary>

   ```python
+  # no code changes needed
   trainer = Trainer(tpu_cores=8)
   ```
@@ -238,10 +240,54 @@ Here are some examples:

 <details>
   <summary>16-bit precision</summary>

   ```python
+  # no code changes needed
   trainer = Trainer(precision=16)
   ```

+
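The hardware and precision flags shown in the snippets above compose in a single `Trainer`. A minimal, self-contained sketch (the tiny model and random data are illustrative assumptions, not from the README):

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl


class TinyModel(pl.LightningModule):
    """Throwaway module, just enough to exercise the Trainer flags."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


data = DataLoader(TensorDataset(torch.randn(64, 32), torch.randn(64, 1)), batch_size=16)

# GPU count and 16-bit precision combine freely; gpus=1 assumes one GPU is present.
trainer = pl.Trainer(max_epochs=1, gpus=1, precision=16)
trainer.fit(TinyModel(), data)
```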
+<details>
+  <summary>Experiment managers</summary>
+
+  ```python
+  from pytorch_lightning import loggers
+
+  # tensorboard
+  trainer = Trainer(logger=loggers.TensorBoardLogger('logs/'))
+
+  # weights and biases
+  trainer = Trainer(logger=loggers.WandbLogger())
+
+  # comet
+  trainer = Trainer(logger=loggers.CometLogger())
+
+  # mlflow
+  trainer = Trainer(logger=loggers.MLFlowLogger())
+
+  # neptune
+  trainer = Trainer(logger=loggers.NeptuneLogger())
+
+  # ... and dozens more
+  ```
+</details>
+
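A usage note on the loggers above: each integration takes its own constructor arguments. A sketch with TensorBoard (the save directory and experiment name are placeholder values):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

# Sketch: control where runs land on disk; 'logs/' and 'my_experiment' are placeholders.
logger = TensorBoardLogger(save_dir='logs/', name='my_experiment')
trainer = Trainer(logger=logger)
```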
+<details>
+  <summary>EarlyStopping</summary>
+
+  ```python
+  es = EarlyStopping(monitor='val_loss')
+  trainer = Trainer(callbacks=[es])
+  ```
+</details>
+
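The `EarlyStopping` callback shown above also takes tuning knobs; a sketch with assumed values:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

# Sketch: stop after 3 validation checks without val_loss improving by >= 0.01.
es = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.01, mode='min')
trainer = Trainer(callbacks=[es])
```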
+<details>
+  <summary>Checkpointing</summary>
+
+  ```python
+  checkpointing = ModelCheckpoint(monitor='val_loss')
+  trainer = Trainer(callbacks=[checkpointing])
+  ```
+</details>
+
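Similarly for `ModelCheckpoint`; a sketch of commonly combined options (the filename pattern is a placeholder):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

# Sketch: keep the 3 best checkpoints ranked by validation loss.
checkpointing = ModelCheckpoint(
    monitor='val_loss',
    save_top_k=3,
    mode='min',
    filename='model-{epoch:02d}-{val_loss:.2f}',  # placeholder pattern
)
trainer = Trainer(callbacks=[checkpointing])
```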
 <details>
   <summary>Export to torchscript (JIT) (production use)</summary>

@@ -287,27 +333,17 @@ class LitAutoEncoder(pl.LightningModule):
 ```

 ---
-## Key Features
+## Advantages over unstructured PyTorch

-* Scale your models to run on any hardware (CPU, GPUs, TPUs) without changing your model
-* Making code more readable by decoupling the research code from the engineering
+* Models become hardware agnostic
+* Code is clear to read because engineering code is abstracted away
 * Easier to reproduce
-* Less error prone by automating most of the training loop and tricky engineering
+* Make fewer mistakes because Lightning handles the tricky engineering
 * Keeps all the flexibility (LightningModules are still PyTorch modules), but removes a ton of boilerplate
-* Lightning has out-of-the-box integration with the popular logging/visualizing frameworks ([Tensorboard](https://pytorch.org/docs/stable/tensorboard.html), [MLFlow](https://mlflow.org/), [Neptune.ai](https://neptune.ai/), [Comet.ml](https://www.comet.ml/site/), [Wandb](https://www.wandb.com/)).
+* Lightning has dozens of integrations with popular machine learning tools.
 * [Tested rigorously with every new PR](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/tests). We test every combination of PyTorch and Python supported versions, every OS, multi GPUs and even TPUs.
 * Minimal running speed overhead (about 300 ms per epoch compared with pure PyTorch).

-### Lightning automates 40+ parts of DL/ML research
-- GPU training
-- Distributed GPU (cluster) training
-- TPU training
-- EarlyStopping
-- Logging/Visualizing
-- Checkpointing
-- Experiment management
-- [Full list here](https://pytorch-lightning.readthedocs.io/en/latest/#common-use-cases)

 ---

 ## Examples

From 194f04826388baf90ff757e2b3c26ba2e770b9ab Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Feb 2021 14:41:38 -0500
Subject: [PATCH 33/34] Update README.md

---
 README.md | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index e09c1f25d1ba8..cdb0b402ff713 100644
--- a/README.md
+++ b/README.md
@@ -213,9 +213,12 @@ Lightning has over [40+ advanced features](https://pytorch-lightning.readthedocs
 Here are some examples:

+
+
+<details>
-  <summary>Train on GPUs without code changes</summary>
+  <summary>Highlighted feature code snippets</summary>

   ```python
   # 8 GPUs
   # no code changes needed
   trainer = Trainer(max_epochs=1, gpus=8)

   # 256 GPUs
   trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32)
   ```
-</details>
-<details>
   <summary>Train on TPUs without code changes</summary>

   ```python
   # no code changes needed
   trainer = Trainer(tpu_cores=8)
   ```
-</details>
-<details>
   <summary>16-bit precision</summary>

   ```python
   # no code changes needed
   trainer = Trainer(precision=16)
   ```
-</details>
-<details>
   <summary>Experiment managers</summary>

   ```python
   from pytorch_lightning import loggers

   # tensorboard
   trainer = Trainer(logger=loggers.TensorBoardLogger('logs/'))

   # weights and biases
   trainer = Trainer(logger=loggers.WandbLogger())

   # comet
   trainer = Trainer(logger=loggers.CometLogger())

   # mlflow
   trainer = Trainer(logger=loggers.MLFlowLogger())

   # neptune
   trainer = Trainer(logger=loggers.NeptuneLogger())

   # ... and dozens more
   ```
-</details>
-<details>
   <summary>EarlyStopping</summary>

   ```python
   es = EarlyStopping(monitor='val_loss')
   trainer = Trainer(callbacks=[es])
   ```
-</details>
-<details>
   <summary>Checkpointing</summary>

   ```python
   checkpointing = ModelCheckpoint(monitor='val_loss')
   trainer = Trainer(callbacks=[checkpointing])
   ```
-</details>
-<details>
   <summary>Export to torchscript (JIT) (production use)</summary>

   ```python
   # torchscript
   autoencoder = LitAutoEncoder()
   torch.jit.save(autoencoder.to_torchscript(), "model.pt")
   ```
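To complement the TorchScript export above, the consumption side looks roughly like this (the input shape is an assumption; match your model's `forward`):

```python
import torch

# Load the TorchScript artifact saved above and run it without Lightning.
scripted = torch.jit.load("model.pt")
scripted.eval()
with torch.no_grad():
    out = scripted(torch.randn(1, 64))  # (1, 64) is an assumed input shape
```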
-</details>
-<details>
   <summary>Export to ONNX (production use)</summary>

   ```python

From 0345fcfaadc3bc4a2948369c379d2ace241d616e Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 13 Feb 2021 14:44:19 -0500
Subject: [PATCH 34/34] Update README.md

---
 README.md | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/README.md b/README.md
index cdb0b402ff713..1f214f4a2b476 100644
--- a/README.md
+++ b/README.md
@@ -382,15 +382,7 @@ If you have any questions please:
 4. [Join our slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A).

 ### Funding
-Building open-source software with only a few part-time people is hard!
-
-[We're venture funded](https://techcrunch.com/2020/10/08/grid-ai-raises-18-6m-series-a-to-help-ai-researchers-and-engineers-bring-their-models-to-production/)
-and backed by some of the top VC funds in the world, [Index Ventures](https://www.indexventures.com/companies/), [Bain Capital Ventures](https://www.baincapitalventures.com/portfolio/), [First Minute Capital](https://firstminute.capital/companies).
-
-Their funding ensures we can continue to build awesome tooling like Grid, give you around the clock support,
-hire a full-time staff, attend conferences, and move faster through implementing features you request.
-
-To supercharge your research and production work, visit our [Grid.ai platform](https://www.grid.ai/)
+[We're venture funded](https://techcrunch.com/2020/10/08/grid-ai-raises-18-6m-series-a-to-help-ai-researchers-and-engineers-bring-their-models-to-production/) to make sure we can provide around-the-clock support, hire a full-time staff, attend conferences, and move faster through implementing features you request.

 ---
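For reference, the ONNX counterpart of the TorchScript export typically looks like the following sketch (`LitAutoEncoder` is the module from the README examples; the output filename and the (1, 64) input shape are assumptions):

```python
import torch

# Sketch: LightningModule.to_onnx wraps torch.onnx.export.
# The input sample shape is an assumption, not from the patch.
autoencoder = LitAutoEncoder()
input_sample = torch.randn(1, 64)
autoencoder.to_onnx("model.onnx", input_sample, export_params=True)
```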