From 2d3f617aa5f5d1e33e6da24242c621205b29e149 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Wed, 10 Mar 2021 17:45:03 +0000
Subject: [PATCH 01/60] Add context to call hook to handle all modules defined
 within the hook

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index b54155d60eae5..eb9d7aba9773e 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -233,6 +233,9 @@ def _init_scheduler_optimizer(self):
     def _initialize_deepspeed_train(self, model):
         if self.on_gpu:
             torch.cuda.set_device(self.root_device)
+        with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
+            self.lightning_module.trainer.call_hook("on_model_parallel_setup")
+
         optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
         if "optimizer" not in self.config:
             rank_zero_info(

From 99495e8ac113eaeb36930a77da1f72f1d9e3e1bd Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Wed, 10 Mar 2021 23:57:35 +0000
Subject: [PATCH 02/60] Expose some additional parameters

---
 pytorch_lightning/core/hooks.py               |  7 +++
 .../plugins/training_type/deepspeed.py        | 52 ++++++++++++++++---
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py
index 9826f9d44ac2c..86480d8c22598 100644
--- a/pytorch_lightning/core/hooks.py
+++ b/pytorch_lightning/core/hooks.py
@@ -334,6 +334,13 @@ def on_post_move_to_device(self):
 
         """
 
+    def on_model_parallel_setup(self) -> None:
+        """
+
+        Returns:
+
+        """
+
 
 class DataHooks:
     """Hooks to be used for data related stuff."""
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index eb9d7aba9773e..cf35ac724d059 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -83,7 +83,12 @@ def __init__(
         initial_scale_power: int = 32,
         loss_scale_window: int = 1000,
         hysteresis: int = 2,
-        min_loss_scale: int = 1
+        min_loss_scale: int = 1,
+        activation_checkpointing: bool = False,
+        partition_activations: bool = False,
+        cpu_checkpointing: bool = False,
+        contiguous_memory_optimization: bool = False,
+        synchronize_checkpoint_boundary: bool = False,
     ) -> None:
         """
 
@@ -159,6 +164,11 @@ def __init__(
             self.config = self._create_default_config(
                 zero_optimization,
                 zero_allow_untested_optimizer,
+                activation_checkpointing=activation_checkpointing,
+                partition_activations=partition_activations,
+                cpu_checkpointing=cpu_checkpointing,
+                contiguous_memory_optimization=contiguous_memory_optimization,
+                synchronize_checkpoint_boundary=synchronize_checkpoint_boundary,
                 stage=stage,
                 cpu_offload=cpu_offload,
                 contiguous_gradients=contiguous_gradients,
@@ -230,11 +240,17 @@ def _init_scheduler_optimizer(self):
         optimizer = optimizers[0]
         return optimizer, scheduler, optimizer_frequencies
 
+    @property
+    def zero_stage_3(self) -> bool:
+        return self.config.get('zero_optimization') and self.config.get('zero_optimization').get('stage') == 3
+
     def _initialize_deepspeed_train(self, model):
         if self.on_gpu:
             torch.cuda.set_device(self.root_device)
-        with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
-            self.lightning_module.trainer.call_hook("on_model_parallel_setup")
+
+        if self.zero_stage_3:
+            with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
+                self.lightning_module.trainer.call_hook("on_model_parallel_setup")
 
         optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
         if "optimizer" not in self.config:
@@ -252,11 +268,22 @@ def _initialize_deepspeed_train(self, model):
             lr_scheduler=lightning_scheduler,
             config_params=self.config,
         )
+        self._set_deepspeed_activation_checkpointing()
 
         # set optimizer for save/load, but deepspeed manages the specific optimizer logic
         self.lightning_module.trainer.optimizers = [optimizer]
         self.model = model
 
+    def _set_deepspeed_activation_checkpointing(self):
+        checkpoint_config = self.config.get('activation_checkpointing', {})
+        deepspeed.checkpointing.configure(
+            mpu_=None,
+            partition_activations=checkpoint_config.get('partition_activations'),
+            contiguous_checkpointing=checkpoint_config.get('contiguous_checkpointing'),
+            checkpoint_in_cpu=checkpoint_config.get('checkpoint_in_cpu'),
+            profile=checkpoint_config.get('profile'),
+        )
+
     def _initialize_deepspeed_inference(self, model):
         # move the model to the correct device
         self.model_to_device()
@@ -346,8 +373,21 @@ def _format_precision_config(self):
             raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.")
 
     def _create_default_config(
-        self, zero_optimization: bool, zero_allow_untested_optimizer: bool, **zero_kwargs
+        self, zero_optimization: bool, zero_allow_untested_optimizer: bool, activation_checkpointing: bool,
+        partition_activations: bool, cpu_checkpointing: bool, contiguous_memory_optimization: bool,
+        synchronize_checkpoint_boundary: bool, **zero_kwargs
     ) -> Dict:
+        cfg = {}
         if zero_optimization:
-            return {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs}
-        return {}
+            cfg = {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs}
+        if activation_checkpointing:
+            cfg = {
+                'activation_checkpointing': {
+                    "partition_activations": partition_activations,
+                    "cpu_checkpointing": cpu_checkpointing,
+                    "contiguous_memory_optimization": contiguous_memory_optimization,
+                    "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary
+                },
+                **cfg
+            }
+        return cfg

From c3aac675ddf8fbd01a0cbf9f0cd6e2499cd50480 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Thu, 11 Mar 2021 13:57:24 +0000
Subject: [PATCH 03/60] Added docs, exposed parameters

---
 .../plugins/training_type/deepspeed.py        | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index cf35ac724d059..addbc1fb744c9 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -67,6 +67,8 @@ def __init__(
         zero_optimization: bool = True,
         stage: int = 2,
         cpu_offload: bool = False,
+        cpu_offload_params: bool = False,
+        cpu_offload_use_pin_memory: bool = False,
         contiguous_gradients: bool = True,
         overlap_comm: bool = True,
         allgather_partitions: bool = True,
@@ -111,6 +113,10 @@ def __init__(
 
             cpu_offload: Enable offloading optimizer memory and computation to CPU
 
+            cpu_offload_params: When using ZeRO stage 3, offload parameters to CPU
+
+            cpu_offload_use_pin_memory: When using ZeRO stage 3, pin memory on CPU
+
             contiguous_gradients: Copies gradients to a continuous buffer as they are produced.
                 Avoids memory fragmentation during backwards. Useful when training large models. (default: True)
 
@@ -149,6 +155,18 @@ def __init__(
 
             min_loss_scale: The minimum FP16 dynamic loss scaling value (Default: 1000)
 
+            activation_checkpointing: Enable activation checkpointing. This allows DeepSpeed to setup global variables
+                however still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint.
+                See https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional
+
+            partition_activations: Enables partition activation when used with ZeRO stage 3
+
+            cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled
+
+            contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory.
+                Not supported by all models
+
+            synchronize_checkpoint_boundary: Insert ``torch.cuda.synchronize()`` at each checkpoint boundary.
         """
         if not _DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
@@ -171,6 +189,8 @@ def __init__(
                 synchronize_checkpoint_boundary=synchronize_checkpoint_boundary,
                 stage=stage,
                 cpu_offload=cpu_offload,
+                cpu_offload_params=cpu_offload_params,
+                cpu_offload_use_pin_memory=cpu_offload_use_pin_memory,
                 contiguous_gradients=contiguous_gradients,
                 overlap_comm=overlap_comm,
                 allgather_partitions=allgather_partitions,

From 340f817406d0f51e052f7ad7884e639c5d8f0a61 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Thu, 11 Mar 2021 17:46:39 +0000
Subject: [PATCH 04/60] Make sure we only configure if necessary

---
 .../plugins/training_type/deepspeed.py          | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index addbc1fb744c9..0e3bc4f16cde8 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -295,14 +295,15 @@ def _initialize_deepspeed_train(self, model):
         self.model = model
 
     def _set_deepspeed_activation_checkpointing(self):
-        checkpoint_config = self.config.get('activation_checkpointing', {})
-        deepspeed.checkpointing.configure(
-            mpu_=None,
-            partition_activations=checkpoint_config.get('partition_activations'),
-            contiguous_checkpointing=checkpoint_config.get('contiguous_checkpointing'),
-            checkpoint_in_cpu=checkpoint_config.get('checkpoint_in_cpu'),
-            profile=checkpoint_config.get('profile'),
-        )
+        if self.config.get('activation_checkpointing'):
+            checkpoint_config = self.config['activation_checkpointing']
+            deepspeed.checkpointing.configure(
+                mpu_=None,
+                partition_activations=checkpoint_config.get('partition_activations'),
+                contiguous_checkpointing=checkpoint_config.get('contiguous_checkpointing'),
+                checkpoint_in_cpu=checkpoint_config.get('checkpoint_in_cpu'),
+                profile=checkpoint_config.get('profile'),
+            )
 
     def _initialize_deepspeed_inference(self, model):
         # move the model to the correct device

From f192afc1f6b824ec0339c4c2b68c907bc01d6123 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Fri, 12 Mar 2021 00:29:37 +0000
Subject: [PATCH 05/60] Setup activation checkpointing regardless, saves the
 user having to do it manually

---
 .../plugins/training_type/deepspeed.py        | 33 ++++++++-----------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 0e3bc4f16cde8..3eb14ff2959e6 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -86,7 +86,6 @@ def __init__(
         loss_scale_window: int = 1000,
         hysteresis: int = 2,
         min_loss_scale: int = 1,
-        activation_checkpointing: bool = False,
         partition_activations: bool = False,
         cpu_checkpointing: bool = False,
         contiguous_memory_optimization: bool = False,
@@ -155,12 +154,10 @@ def __init__(
 
             min_loss_scale: The minimum FP16 dynamic loss scaling value (Default: 1000)
 
-            activation_checkpointing: Enable activation checkpointing. This allows DeepSpeed to setup global variables
-                however still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint.
+            partition_activations: Enables partition activation when used with ZeRO stage 3.
+                Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint.
                 See https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional
 
-            partition_activations: Enables partition activation when used with ZeRO stage 3
-
             cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled
 
             contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory.
@@ -182,7 +179,6 @@ def __init__(
             self.config = self._create_default_config(
                 zero_optimization,
                 zero_allow_untested_optimizer,
-                activation_checkpointing=activation_checkpointing,
                 partition_activations=partition_activations,
                 cpu_checkpointing=cpu_checkpointing,
                 contiguous_memory_optimization=contiguous_memory_optimization,
@@ -394,21 +390,18 @@ def _format_precision_config(self):
             raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.")
 
     def _create_default_config(
-        self, zero_optimization: bool, zero_allow_untested_optimizer: bool, activation_checkpointing: bool,
-        partition_activations: bool, cpu_checkpointing: bool, contiguous_memory_optimization: bool,
-        synchronize_checkpoint_boundary: bool, **zero_kwargs
+        self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool,
+        cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool,
+        **zero_kwargs
     ) -> Dict:
-        cfg = {}
+        cfg = {
+            'activation_checkpointing': {
+                "partition_activations": partition_activations,
+                "cpu_checkpointing": cpu_checkpointing,
+                "contiguous_memory_optimization": contiguous_memory_optimization,
+                "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary
+            }
+        }
         if zero_optimization:
             cfg = {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs}
-        if activation_checkpointing:
-            cfg = {
-                'activation_checkpointing': {
-                    "partition_activations": partition_activations,
-                    "cpu_checkpointing": cpu_checkpointing,
-                    "contiguous_memory_optimization": contiguous_memory_optimization,
-                    "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary
-                },
-                **cfg
-            }
         return cfg

From a2784a479996ba444b603ffe88752fae0ddb917a Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Fri, 12 Mar 2021 23:43:10 +0000
Subject: [PATCH 06/60] Add some tests that fail currently

---
 .../plugins/training_type/deepspeed.py        | 43 +++++++------
 tests/plugins/test_deepspeed_plugin.py        | 63 ++++++++++++++++++-
 2 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 3eb14ff2959e6..612a92edbc97d 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -20,7 +20,6 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
-from torch.nn.parallel import DistributedDataParallel
 
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import _LightningModuleWrapperBase
@@ -239,6 +238,11 @@ def init_deepspeed(self):
         precision = self.lightning_module.trainer.accelerator.precision
         model = LightningDeepSpeedModule(pl_module=self.model, precision=precision)
 
+        if self.on_gpu:
+            torch.cuda.set_device(self.root_device)
+
+        self._call_model_parallel_setup()
+
         if self.lightning_module.trainer and self.lightning_module.trainer.training:
             self._initialize_deepspeed_train(model)
         else:
@@ -261,13 +265,6 @@ def zero_stage_3(self) -> bool:
         return self.config.get('zero_optimization') and self.config.get('zero_optimization').get('stage') == 3
 
     def _initialize_deepspeed_train(self, model):
-        if self.on_gpu:
-            torch.cuda.set_device(self.root_device)
-
-        if self.zero_stage_3:
-            with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
-                self.lightning_module.trainer.call_hook("on_model_parallel_setup")
-
         optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
         if "optimizer" not in self.config:
             rank_zero_info(
@@ -290,6 +287,11 @@ def _initialize_deepspeed_train(self, model):
         self.lightning_module.trainer.optimizers = [optimizer]
         self.model = model
 
+    def _call_model_parallel_setup(self):
+        if self.zero_stage_3:
+            with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
+                self.lightning_module.trainer.call_hook("on_model_parallel_setup")
+
     def _set_deepspeed_activation_checkpointing(self):
         if self.config.get('activation_checkpointing'):
             checkpoint_config = self.config['activation_checkpointing']
@@ -302,15 +304,16 @@ def _set_deepspeed_activation_checkpointing(self):
             )
 
     def _initialize_deepspeed_inference(self, model):
-        # move the model to the correct device
-        self.model_to_device()
-
-        self.pre_configure_ddp()
-        self.model = DistributedDataParallel(
-            model,
-            device_ids=self.determine_ddp_device_ids(),
-            **self._ddp_kwargs,
+        inference_config = {
+            'train_micro_batch_size_per_gpu': 1,
+            'fp16': self.config['fp16'],
+        }
+        model, _, _, _ = deepspeed.initialize(
+            args=SimpleNamespace(local_rank=self.local_rank),
+            model=model,
+            config_params=inference_config,
         )
+        self.model = model
 
     def configure_scheduler(self, lr_scheduler):
         scheduler = _get_default_scheduler_config()
@@ -357,7 +360,7 @@ def _format_batch_size_and_grad_accum_config(self):
         if "train_micro_batch_size_per_gpu" not in self.config:
             # train_micro_batch_size_per_gpu is used for throughput logging purposes
             # by default we use the batch size of the loader which may be incorrect if a batch sampler is passed
-            batch_size = self.lightning_module.train_dataloader().batch_size
+            batch_size = self.lightning_module.train_dataloader().batch_sampler.batch_size
             self.config["train_micro_batch_size_per_gpu"] = batch_size
         self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches
         if "gradient_clipping" not in self.config:
@@ -403,5 +406,9 @@ def _create_default_config(
             }
         }
         if zero_optimization:
-            cfg = {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs}
+            cfg = {
+                "zero_allow_untested_optimizer": zero_allow_untested_optimizer,
+                "zero_optimization": zero_kwargs,
+                **cfg
+            }
         return cfg
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index cf5c23a824732..81b38d85c9c10 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -301,6 +301,22 @@ def on_train_start(self) -> None:
         trainer.fit(model)
 
 
+@RunIf(deepspeed=True)
+def test_deepspeed_custom_activation_checkpointing_params(tmpdir):
+    """Ensure if we modify the activation checkpointing parameters, the deepspeed config contains these changes."""
+    ds = DeepSpeedPlugin(
+        partition_activations=True,
+        cpu_checkpointing=True,
+        contiguous_memory_optimization=True,
+        synchronize_checkpoint_boundary=True
+    )
+    checkpoint_config = ds.config['activation_checkpointing']
+    assert checkpoint_config['partition_activations']
+    assert checkpoint_config['cpu_checkpointing']
+    assert checkpoint_config['contiguous_memory_optimization']
+    assert checkpoint_config['synchronize_checkpoint_boundary']
+
+
 @RunIf(min_gpus=1, deepspeed=True)
 def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
     """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config."""
@@ -324,7 +340,7 @@ def on_train_start(self) -> None:
         trainer.fit(model)
 
 
-@RunIf(min_gpus=2, special=True, deepspeed=True)
+@RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu(tmpdir, deepspeed_config):
     """
         Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation.
@@ -343,6 +359,51 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config):
     _assert_save_model_is_equal(model, tmpdir, trainer)
 
 
+class ModelParallelBoringModel(BoringModel):
+
+    def __init__(self):
+        super().__init__()
+        self.linear = None
+
+    def on_model_parallel_setup(self) -> None:
+        self.linear = torch.nn.Linear(32, 2)
+
+
+@RunIf(min_gpus=2, deepspeed=True)
+def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
+    """
+        Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation.
+    """
+    model = ModelParallelBoringModel()
+    trainer = Trainer(
+        plugins=[DeepSpeedPlugin(stage=3)],
+        default_root_dir=tmpdir,
+        gpus=2,
+        fast_dev_run=True,
+        precision=16,
+    )
+    trainer.fit(model)
+    trainer.test(model)
+
+    _assert_save_model_is_equal(model, tmpdir, trainer)
+
+
+@RunIf(min_gpus=2, deepspeed=True)
+def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
+    """
+        Test to ensure we can use DeepSpeed with just test.
+    """
+    model = ModelParallelBoringModel()
+    trainer = Trainer(
+        plugins=[DeepSpeedPlugin(stage=3)],
+        default_root_dir=tmpdir,
+        gpus=2,
+        fast_dev_run=True,
+        precision=16,
+    )
+    trainer.test(model)
+
+
 def _assert_save_model_is_equal(model, tmpdir, trainer):
     checkpoint_path = os.path.join(tmpdir, 'model.pt')
     trainer.save_checkpoint(checkpoint_path)

From b0dab3d3f95b6482a013477ca21f745c327a0da8 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Mon, 15 Mar 2021 19:42:24 +0000
Subject: [PATCH 07/60] Route checkpoint save/restore through the training type plugin

---
 .../plugins/training_type/deepspeed.py        | 29 ++++++++++++++++++-
 .../training_type/training_type_plugin.py     | 12 ++++++++
 .../connectors/checkpoint_connector.py        | 17 +++++------
 pytorch_lightning/trainer/trainer.py          |  3 +-
 4 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 612a92edbc97d..994d4218ff2fd 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -222,6 +222,8 @@ def pre_dispatch(self):
 
         self.init_deepspeed()
 
+        self.lightning_module.trainer.save_checkpoint = self.save_checkpoint
+
         # set warning rank
         rank_zero_only.rank = self.global_rank
 
@@ -367,7 +369,6 @@ def _format_batch_size_and_grad_accum_config(self):
             self.config["gradient_clipping"] = self.lightning_module.trainer.gradient_clip_val
 
     def _format_precision_config(self):
-
         amp_type = self.lightning_module.trainer.accelerator_connector.amp_type
         amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
         precision = self.lightning_module.trainer.accelerator_connector.precision
@@ -412,3 +413,29 @@ def _create_default_config(
                 **cfg
             }
         return cfg
+
+    def _filepath_to_dir(self, filepath: str):
+        return filepath.split('.')[0]
+
+    def save_checkpoint(self, filepath: str, weights_only: bool = False):
+        """Save model/training states as a checkpoint file through state-dump and file-write.
+
+        Args:
+            filepath: write-target file's path
+            weights_only: saving model weights only
+        """
+        # dump states as a checkpoint dictionary object
+        _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
+        save_dir = self._filepath_to_dir(filepath)
+        _exclude_keys = []# ['optimizer_states', 'lr_schedulers']
+        _checkpoint = {k:v for k, v in _checkpoint.items() if k not in _exclude_keys}
+        self.model.save_checkpoint(save_dir, client_state=_checkpoint)
+
+    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage):
+        if torch.distributed.is_available():
+            from pytorch_lightning.trainer.states import TrainerState
+            _load_optimization = self.lightning_module.trainer.state == TrainerState.FITTING
+            save_dir = self._filepath_to_dir(ckpt_path)
+            self.model.optimizer._partition_all_parameters() 
+            _, client_state = self.model.load_checkpoint(save_dir, load_optimizer_states=_load_optimization, load_lr_scheduler_states=_load_optimization)
+            return client_state, False
\ No newline at end of file
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 7783f066dbc61..603cf64e8d7b5 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union
+from pytorch_lightning.utilities.cloud_io import load as pl_load
 
 import torch
 from torch.nn import Module
@@ -169,3 +170,14 @@ def init_optimizers(self, trainer: "Trainer", model: LightningModule):
 
     def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
         optimizer.step(closure=lambda_closure, **kwargs)
+
+    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Dict:
+        ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
+        # restore datamodule states
+        if self.lightning_module.trainer.datamodule is not None:
+            self.lightning_module.trainer.datamodule.on_load_checkpoint(ckpt)
+
+        # hook: give user access to checkpoint if needed.
+        self.lightning_module.on_load_checkpoint(ckpt)
+        self.lightning_module.load_state_dict(ckpt['state_dict'])
+        return ckpt, True
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index 60c76b70bba50..987d2dec65f13 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -90,20 +90,16 @@ def restore(self, checkpoint_path: str, on_gpu: bool) -> bool:
             rank_zero_warn("No checkpoint file exists at `resume_from_checkpoint`. Start from scratch")
             return False
 
-        # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path`
-        checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
+        checkpoint, load_optimizer_states = self.trainer.training_type_plugin.restore_model_state_from_ckpt_path(
+            checkpoint_path, map_location=lambda storage, loc: storage)
 
-        # acquire the model
         model = self.trainer.lightning_module
 
-        # restore model and datamodule state
-        self.restore_model_state(model, checkpoint)
-
         if on_gpu:
             model.cuda(self.trainer.root_gpu)
 
         # restore training state
-        self.restore_training_state(checkpoint)
+        self.restore_training_state(checkpoint, load_optimizer_states)
 
         rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}")
         return True
@@ -123,7 +119,7 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None:
         # restore model state_dict
         model.load_state_dict(checkpoint['state_dict'])
 
-    def restore_training_state(self, checkpoint):
+    def restore_training_state(self, checkpoint, load_optimizer_states: bool):
         """
         Restore trainer state.
         Model will get its change to update
@@ -131,7 +127,7 @@ def restore_training_state(self, checkpoint):
         :return:
         """
         # validation
-        if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint:
+        if load_optimizer_states and ('optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint):
             raise KeyError(
                 'Trying to restore training state but checkpoint contains only the model.'
                 ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
@@ -177,6 +173,9 @@ def restore_training_state(self, checkpoint):
                 " consider using an end of epoch checkpoint."
             )
 
+        if not load_optimizer_states:
+            return
+
         # restore the optimizers
         optimizer_states = checkpoint['optimizer_states']
         for optimizer, opt_state in zip(self.trainer.optimizers, optimizer_states):
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index c3039d24aadc0..8ebab2e110aca 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -971,8 +971,7 @@ def __load_ckpt_weights(
 
             self.training_type_plugin.barrier()
 
-            ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
-            model.load_state_dict(ckpt['state_dict'])
+            self.training_type_plugin.restore_model_state_from_ckpt_path(ckpt_path, map_location=lambda storage, loc: storage)
         return ckpt_path
 
     def predict(

From 0c44f0585a260ae684a47c26b6ebaa8f5afcb200 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Mon, 15 Mar 2021 19:58:03 +0000
Subject: [PATCH 08/60] Exclude model/optimizer state from DeepSpeed client state, call load hooks

---
 .../plugins/training_type/deepspeed.py         | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 994d4218ff2fd..b15975019db2d 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -425,17 +425,21 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False):
             weights_only: saving model weights only
         """
         # dump states as a checkpoint dictionary object
-        _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
+        client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
         save_dir = self._filepath_to_dir(filepath)
-        _exclude_keys = []# ['optimizer_states', 'lr_schedulers']
-        _checkpoint = {k:v for k, v in _checkpoint.items() if k not in _exclude_keys}
-        self.model.save_checkpoint(save_dir, client_state=_checkpoint)
+        _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers']
+        client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys}
+        self.model.save_checkpoint(save_dir, client_state=client_state)
 
     def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage):
         if torch.distributed.is_available():
-            from pytorch_lightning.trainer.states import TrainerState
-            _load_optimization = self.lightning_module.trainer.state == TrainerState.FITTING
             save_dir = self._filepath_to_dir(ckpt_path)
             self.model.optimizer._partition_all_parameters() 
-            _, client_state = self.model.load_checkpoint(save_dir, load_optimizer_states=_load_optimization, load_lr_scheduler_states=_load_optimization)
+            _, client_state = self.model.load_checkpoint(save_dir)
+
+            if self.lightning_module.trainer.datamodule is not None:
+                self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state)
+
+            # hook: give user access to checkpoint if needed.
+            self.lightning_module.on_load_checkpoint(client_state)
             return client_state, False
\ No newline at end of file

From 26655d7f032b2680b8a35c0b4a159c8224af51a3 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Mon, 15 Mar 2021 20:02:39 +0000
Subject: [PATCH 09/60] update

---
 pytorch_lightning/plugins/training_type/deepspeed.py  | 11 ++++++++---
 .../plugins/training_type/training_type_plugin.py     |  4 ++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index b15975019db2d..be6df07ba22b3 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -417,7 +417,7 @@ def _create_default_config(
     def _filepath_to_dir(self, filepath: str):
         return filepath.split('.')[0]
 
-    def save_checkpoint(self, filepath: str, weights_only: bool = False):
+    def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
 
         Args:
@@ -431,12 +431,17 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False):
         client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys}
         self.model.save_checkpoint(save_dir, client_state=client_state)
 
-    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage):
+    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
         if torch.distributed.is_available():
+            from pytorch_lightning.trainer.states import TrainerState
+            load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING
             save_dir = self._filepath_to_dir(ckpt_path)
             self.model.optimizer._partition_all_parameters() 
-            _, client_state = self.model.load_checkpoint(save_dir)
 
+            _, client_state = self.model.load_checkpoint(
+                save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states)
+            
+            # restore datamodule states
             if self.lightning_module.trainer.datamodule is not None:
                 self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state)
 
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 603cf64e8d7b5..9e2de5ce61ab0 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union
+from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union, Tuple
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 
 import torch
@@ -171,7 +171,7 @@ def init_optimizers(self, trainer: "Trainer", model: LightningModule):
     def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
         optimizer.step(closure=lambda_closure, **kwargs)
 
-    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Dict:
+    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
         ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
         # restore datamodule states
         if self.lightning_module.trainer.datamodule is not None:

From ac19f369172e179404814b782264b3e553bf90bb Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 16 Mar 2021 10:16:18 +0000
Subject: [PATCH 10/60] add tests

---
 tests/helpers/simple_models.py         |  6 +--
 tests/plugins/test_deepspeed_plugin.py | 55 +++++++++++++++++++++++++-
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py
index 1abeb1f00206a..a35ab2fc4a2fe 100644
--- a/tests/helpers/simple_models.py
+++ b/tests/helpers/simple_models.py
@@ -54,20 +54,20 @@ def training_step(self, batch, batch_idx):
         logits = self.forward(x)
         loss = F.cross_entropy(logits, y)
         self.log('train_loss', loss, prog_bar=True)
-        self.log('train_acc', self.train_acc(logits, y), prog_bar=True)
+        self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('val_acc', self.valid_acc(logits, y), prog_bar=True)
+        self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('test_acc', self.test_acc(logits, y), prog_bar=True)
+        self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
 
 
 class RegressionModel(LightningModule):
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 81b38d85c9c10..7a8e226e3ca56 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,16 +1,20 @@
 import json
 import os
+from pytorch_lightning.core import datamodule
 
 import pytest
 import torch
 from torch import Tensor
 from torch.optim import Optimizer
-
+from torch import nn
+from pytorch_lightning.metrics import Accuracy 
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
+from tests.helpers.datamodules import ClassifDataModule
+from tests.helpers.simple_models import ClassificationModel
 from tests.helpers.runif import RunIf
 
 
@@ -369,6 +373,24 @@ def on_model_parallel_setup(self) -> None:
         self.linear = torch.nn.Linear(32, 2)
 
 
+class ModelParallelClassificationModel(ClassificationModel):
+
+    def __init__(self, lr=0.01):
+        super().__init__()
+
+        self.lr = lr
+        self.train_acc = Accuracy()
+        self.valid_acc = Accuracy()
+        self.test_acc = Accuracy()
+
+    def on_model_parallel_setup(self) -> None:
+        for i in range(3):
+            setattr(self, f"layer_{i}", nn.Linear(32, 32))
+            setattr(self, f"layer_{i}a", nn.ReLU())
+        setattr(self, "layer_end", nn.Linear(32, 3))
+
+
+
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
     """
@@ -388,6 +410,37 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
     _assert_save_model_is_equal(model, tmpdir, trainer)
 
 
+@pytest.mark.skipif("Currently failing")
+@RunIf(min_gpus=2, deepspeed=True)
+def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
+    """
+        Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation.
+    """
+    model = ModelParallelClassificationModel()
+    dm = ClassifDataModule()
+    trainer = Trainer(
+        max_epochs=2,
+        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)],
+        default_root_dir=tmpdir,
+        gpus=2,
+        fast_dev_run=True,
+        precision=16,
+    )
+    trainer.fit(model, datamodule=dm)
+
+    trainer = Trainer(
+        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)],
+        default_root_dir=tmpdir,
+        gpus=2,
+        fast_dev_run=True,
+        precision=16,
+        resume_from_checkpoint=trainer.checkpoint_callback.best_model_path,
+    )
+    trainer.fit(model, datamodule=dm)
+
+    _assert_save_model_is_equal(model, tmpdir, trainer)
+
+
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
     """

From d273393ece9c207b02f02c38e3eb8ddb71b32605 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 16 Mar 2021 10:39:51 +0000
Subject: [PATCH 11/60] change docstring

---
 tests/plugins/test_deepspeed_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 7a8e226e3ca56..47ff338063dea 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -414,7 +414,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
     """
-        Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation.
+        Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
     """
     model = ModelParallelClassificationModel()
     dm = ClassifDataModule()

From c91d12854eba5b23a27eb042b12240b1a24f3b66 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 16 Mar 2021 18:33:42 +0000
Subject: [PATCH 12/60] resolve accumulate_grad_batches

---
 .../plugins/training_type/deepspeed.py        | 65 +++++++++++++++---
 .../training_type/training_type_plugin.py     |  3 +
 .../connectors/checkpoint_connector.py        |  2 +
 pytorch_lightning/trainer/training_loop.py    |  6 +-
 test.json                                     |  1 +
 tests/helpers/simple_models.py                |  7 +-
 tests/plugins/test_deepspeed_plugin.py        | 66 ++++++++++++++++---
 7 files changed, 129 insertions(+), 21 deletions(-)
 create mode 100644 test.json

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index be6df07ba22b3..338dc7540e8fb 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -18,6 +18,7 @@
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from pytorch_lightning.callbacks import GradientAccumulationScheduler
 
 import torch
 
@@ -237,6 +238,8 @@ def init_deepspeed(self):
             self._format_config()
             self._config_initialized = True
 
+        self._handle_gradient_accumulation_steps()
+
         precision = self.lightning_module.trainer.accelerator.precision
         model = LightningDeepSpeedModule(pl_module=self.model, precision=precision)
 
@@ -287,6 +290,7 @@ def _initialize_deepspeed_train(self, model):
 
         # set optimizer for save/load, but deepspeed manages the specific optimizer logic
         self.lightning_module.trainer.optimizers = [optimizer]
+        self.lightning_module.trainer.schedulers = [lr_scheduler]
         self.model = model
 
     def _call_model_parallel_setup(self):
@@ -306,14 +310,29 @@ def _set_deepspeed_activation_checkpointing(self):
             )
 
     def _initialize_deepspeed_inference(self, model):
+        optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
+        if "optimizer" not in self.config:
+            rank_zero_info(
+                "You have not specified an optimizer or scheduler within the DeepSpeed config."
+                "Using `configure_optimizers` to define optimizer and scheduler."
+            )
+            optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
         inference_config = {
             'train_micro_batch_size_per_gpu': 1,
             'fp16': self.config['fp16'],
         }
+        if self.zero_stage_3:
+            inference_config.update({
+                "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'],
+                "zero_optimization": self.config['zero_optimization'],
+            })
         model, _, _, _ = deepspeed.initialize(
             args=SimpleNamespace(local_rank=self.local_rank),
             model=model,
+            optimizer=optimizer,
+            lr_scheduler=lightning_scheduler,
             config_params=inference_config,
+            model_parameters=[],
         )
         self.model = model
 
@@ -344,6 +363,13 @@ def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Calla
         # internally, the engine has a reference to the optimizer already.
         self.model.step(**kwargs)
 
+    def _handle_gradient_accumulation_steps(self):
+        if self.config.get("gradient_accumulation_steps") > 1:
+            self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches
+            self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1})
+        else:
+            self._original_accumulate_grad_batches = None
+
     def _format_config(self):
         if self.config is None:
             raise MisconfigurationException(
@@ -424,19 +450,24 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
             filepath: write-target file's path
             weights_only: saving model weights only
         """
-        # dump states as a checkpoint dictionary object
-        client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
-        save_dir = self._filepath_to_dir(filepath)
-        _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers']
-        client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys}
-        self.model.save_checkpoint(save_dir, client_state=client_state)
+        if torch.distributed.get_world_size() > 1:
+            # dump states as a checkpoint dictionary object
+            client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
+            save_dir = self._filepath_to_dir(filepath)
+            _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers']
+            client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys}
+            self.model.save_checkpoint(save_dir, client_state=client_state)
+        else:
+            self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath)
 
     def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
-        if torch.distributed.is_available():
+        if torch.distributed.get_world_size() > 1:
             from pytorch_lightning.trainer.states import TrainerState
             load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING
             save_dir = self._filepath_to_dir(ckpt_path)
-            self.model.optimizer._partition_all_parameters() 
+            
+            if self.zero_stage_3:
+                self.model.optimizer._partition_all_parameters() 
 
             _, client_state = self.model.load_checkpoint(
                 save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states)
@@ -447,4 +478,20 @@ def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda
 
             # hook: give user access to checkpoint if needed.
             self.lightning_module.on_load_checkpoint(client_state)
-            return client_state, False
\ No newline at end of file
+            return client_state, False
+        else:
+            super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location)
+        return {}, False
+
+    def _accumulated_batches_reached(self, trainer):
+        return (trainer.total_batch_idx) % trainer.accumulate_grad_batches == 0
+
+    def increment_accumulated_grad_global_step(self, trainer):
+        if self._original_accumulate_grad_batches is None:
+            trainer.global_step += 1
+        else:
+            trainer.accumulate_grad_batches = self._original_accumulate_grad_batches
+            #print("increment_accumulated_grad_global_step", trainer.total_batch_idx, not self.should_accumulate(trainer), trainer.global_step, self.model.global_steps)
+            if self._accumulated_batches_reached(trainer):
+                trainer.global_step += 1
+            trainer.accumulate_grad_batches = 1
\ No newline at end of file
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 9e2de5ce61ab0..9588a0fc1b145 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -181,3 +181,6 @@ def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda
         self.lightning_module.on_load_checkpoint(ckpt)
         self.lightning_module.load_state_dict(ckpt['state_dict'])
         return ckpt, True
+
+    def increment_accumulated_grad_global_step(self, trainer) -> None:
+        trainer.global_step += 1
\ No newline at end of file
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index 987d2dec65f13..23dcd01d63feb 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -98,6 +98,8 @@ def restore(self, checkpoint_path: str, on_gpu: bool) -> bool:
         if on_gpu:
             model.cuda(self.trainer.root_gpu)
 
+        print(checkpoint)
+
         # restore training state
         self.restore_training_state(checkpoint, load_optimizer_states)
 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 88b87afcb9358..89e62cc51f543 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -547,6 +547,8 @@ def run_training_epoch(self):
             if self._num_training_batches_reached(is_last_batch):
                 break
 
+
+
             # progress global step according to grads progress
             self.increment_accumulated_grad_global_step()
 
@@ -632,6 +634,8 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):
                         opt_idx=opt_idx,
                     )
 
+                    self.trainer.training_type_plugin.on_batch
+
                 # ------------------------------
                 # BACKWARD PASS
                 # ------------------------------
@@ -806,7 +810,7 @@ def increment_accumulated_grad_global_step(self):
 
         # progress global step according to grads progress
         if num_accumulated_batches_reached or num_training_batches_reached:
-            self.trainer.global_step += 1
+            self.trainer.training_type_plugin.increment_accumulated_grad_global_step(self.trainer)
 
     def _accumulated_batches_reached(self):
         return (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0
diff --git a/test.json b/test.json
new file mode 100644
index 0000000000000..d6a14fb12477c
--- /dev/null
+++ b/test.json
@@ -0,0 +1 @@
+{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 3, 'cpu_offload': True, 'cpu_offload_params': False, 'cpu_offload_use_pin_memory': False, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000.0, 'reduce_bucket_size': 200000000.0}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'train_micro_batch_size_per_gpu': 10, 'gradient_accumulation_steps': 1, 'gradient_clipping': 0, 'fp16': {'enabled': True, 'loss_scale': 0, 'initial_scale_power': 32, 'loss_scale_window': 1000, 'hysteresis': 2, 'min_loss_scale': 1}}
\ No newline at end of file
diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py
index a35ab2fc4a2fe..85f1968bee7eb 100644
--- a/tests/helpers/simple_models.py
+++ b/tests/helpers/simple_models.py
@@ -14,6 +14,7 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
+from torch import distributed as dist
 
 from pytorch_lightning import LightningModule
 from pytorch_lightning.metrics import Accuracy, MeanSquaredError
@@ -54,20 +55,20 @@ def training_step(self, batch, batch_idx):
         logits = self.forward(x)
         loss = F.cross_entropy(logits, y)
         self.log('train_loss', loss, prog_bar=True)
-        self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
+        #self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
+        # self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
+        # self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
 
 
 class RegressionModel(LightningModule):
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 47ff338063dea..b91a178473e5a 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,14 +1,15 @@
 import json
 import os
+from pytorch_lightning.callbacks.base import Callback
 from pytorch_lightning.core import datamodule
-
+from typing import Any
 import pytest
 import torch
 from torch import Tensor
 from torch.optim import Optimizer
 from torch import nn
 from pytorch_lightning.metrics import Accuracy 
-from pytorch_lightning import Trainer
+from pytorch_lightning import Trainer, callbacks
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -16,6 +17,7 @@
 from tests.helpers.datamodules import ClassifDataModule
 from tests.helpers.simple_models import ClassificationModel
 from tests.helpers.runif import RunIf
+from pytorch_lightning import LightningModule
 
 
 def test_deepspeed_lightning_module(tmpdir):
@@ -389,6 +391,10 @@ def on_model_parallel_setup(self) -> None:
             setattr(self, f"layer_{i}a", nn.ReLU())
         setattr(self, "layer_end", nn.Linear(32, 3))
 
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
+        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
+        return [optimizer], [lr_scheduler]
 
 
 @RunIf(min_gpus=2, deepspeed=True)
@@ -410,7 +416,6 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
     _assert_save_model_is_equal(model, tmpdir, trainer)
 
 
-@pytest.mark.skipif("Currently failing")
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
     """
@@ -420,25 +425,70 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
     dm = ClassifDataModule()
     trainer = Trainer(
         max_epochs=2,
-        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)],
+        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)],
         default_root_dir=tmpdir,
         gpus=2,
-        fast_dev_run=True,
+        limit_val_batches=2,
+        limit_test_batches=2,
         precision=16,
+        accumulate_grad_batches=2,
     )
     trainer.fit(model, datamodule=dm)
 
     trainer = Trainer(
-        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)],
+        max_epochs=3,
+        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)],
         default_root_dir=tmpdir,
         gpus=2,
-        fast_dev_run=True,
+        limit_val_batches=2,
+        limit_test_batches=2,
         precision=16,
         resume_from_checkpoint=trainer.checkpoint_callback.best_model_path,
     )
     trainer.fit(model, datamodule=dm)
+    trainer.test(datamodule=dm)
 
-    _assert_save_model_is_equal(model, tmpdir, trainer)
+
+@RunIf(min_gpus=2, deepspeed=True)
+def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir, deepspeed_config):
+    """
+        Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
+    """
+    class VerificationCallback(Callback):
+
+        def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
+            deepspeed_engine = trainer.training_type_plugin.model
+            assert trainer.global_step == deepspeed_engine.global_steps
+
+
+    model = ModelParallelClassificationModel()
+    dm = ClassifDataModule()
+    trainer = Trainer(
+        max_epochs=2,
+        plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)],
+        default_root_dir=tmpdir,
+        gpus=2,
+        limit_val_batches=2,
+        limit_test_batches=2,
+        precision=16,
+        accumulate_grad_batches=3,
+        callbacks=[VerificationCallback()]
+    )
+    trainer.fit(model, datamodule=dm)
+
+    trainer = Trainer(
+        max_epochs=3,
+        plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)],
+        default_root_dir=tmpdir,
+        gpus=2,
+        limit_val_batches=2,
+        limit_test_batches=2,
+        precision=16,
+        resume_from_checkpoint=trainer.checkpoint_callback.best_model_path,
+        callbacks=[VerificationCallback()]
+    )
+    trainer.fit(model, datamodule=dm)
+    trainer.test(datamodule=dm)
 
 
 @RunIf(min_gpus=2, deepspeed=True)

From 959d7b7c5e15f4e7ec1fa7913c98c74975e83339 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 16 Mar 2021 18:40:31 +0000
Subject: [PATCH 13/60] resolve flake8

---
 .../plugins/training_type/deepspeed.py        | 20 ++++++++++--------
 .../training_type/training_type_plugin.py     | 10 +++++----
 pytorch_lightning/trainer/trainer.py          |  9 ++++----
 pytorch_lightning/trainer/training_loop.py    |  2 --
 test.json                                     |  1 -
 tests/helpers/simple_models.py                |  9 ++++----
 tests/plugins/test_deepspeed_plugin.py        | 21 ++++++++++---------
 7 files changed, 36 insertions(+), 36 deletions(-)
 delete mode 100644 test.json

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 338dc7540e8fb..fc99866cefe49 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -18,10 +18,10 @@
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-from pytorch_lightning.callbacks import GradientAccumulationScheduler
 
 import torch
 
+from pytorch_lightning.callbacks import GradientAccumulationScheduler
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import _LightningModuleWrapperBase
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
@@ -455,23 +455,26 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
             client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
             save_dir = self._filepath_to_dir(filepath)
             _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers']
-            client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys}
+            client_state = {k: v for k, v in client_state.items() if k not in _exclude_keys}
             self.model.save_checkpoint(save_dir, client_state=client_state)
         else:
             self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath)
 
-    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
+    def restore_model_state_from_ckpt_path(self,
+                                           ckpt_path: str,
+                                           map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
         if torch.distributed.get_world_size() > 1:
             from pytorch_lightning.trainer.states import TrainerState
             load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING
             save_dir = self._filepath_to_dir(ckpt_path)
-            
+
             if self.zero_stage_3:
-                self.model.optimizer._partition_all_parameters() 
+                self.model.optimizer._partition_all_parameters()
 
             _, client_state = self.model.load_checkpoint(
-                save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states)
-            
+                save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states
+            )
+
             # restore datamodule states
             if self.lightning_module.trainer.datamodule is not None:
                 self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state)
@@ -491,7 +494,6 @@ def increment_accumulated_grad_global_step(self, trainer):
             trainer.global_step += 1
         else:
             trainer.accumulate_grad_batches = self._original_accumulate_grad_batches
-            #print("increment_accumulated_grad_global_step", trainer.total_batch_idx, not self.should_accumulate(trainer), trainer.global_step, self.model.global_steps)
             if self._accumulated_batches_reached(trainer):
                 trainer.global_step += 1
-            trainer.accumulate_grad_batches = 1
\ No newline at end of file
+            trainer.accumulate_grad_batches = 1
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 9588a0fc1b145..49c337e936b8a 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union, Tuple
-from pytorch_lightning.utilities.cloud_io import load as pl_load
+from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union
 
 import torch
 from torch.nn import Module
@@ -23,6 +22,7 @@
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import unwrap_lightning_module
 from pytorch_lightning.plugins.base_plugin import Plugin
+from pytorch_lightning.utilities.cloud_io import load as pl_load
 
 if TYPE_CHECKING:
     from pytorch_lightning.trainer.trainer import Trainer
@@ -171,7 +171,9 @@ def init_optimizers(self, trainer: "Trainer", model: LightningModule):
     def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
         optimizer.step(closure=lambda_closure, **kwargs)
 
-    def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
+    def restore_model_state_from_ckpt_path(self,
+                                           ckpt_path: str,
+                                           map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
         ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
         # restore datamodule states
         if self.lightning_module.trainer.datamodule is not None:
@@ -183,4 +185,4 @@ def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda
         return ckpt, True
 
     def increment_accumulated_grad_global_step(self, trainer) -> None:
-        trainer.global_step += 1
\ No newline at end of file
+        trainer.global_step += 1
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 8ebab2e110aca..acdf226cfa89b 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -58,7 +58,6 @@
 from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
 from pytorch_lightning.tuner.tuning import Tuner
 from pytorch_lightning.utilities import rank_zero_warn
-from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.debugging import InternalDebugger
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.memory import recursive_detach
@@ -922,9 +921,7 @@ def test(
 
         # If you supply a datamodule you can't supply test_dataloaders
         if test_dataloaders and datamodule:
-            raise MisconfigurationException(
-                'You cannot pass both `trainer.test(test_dataloaders=..., datamodule=...)`'
-            )
+            raise MisconfigurationException('You cannot pass both `trainer.test(test_dataloaders=..., datamodule=...)`')
 
         model_provided = model is not None
         model = model or self.lightning_module
@@ -971,7 +968,9 @@ def __load_ckpt_weights(
 
             self.training_type_plugin.barrier()
 
-            self.training_type_plugin.restore_model_state_from_ckpt_path(ckpt_path, map_location=lambda storage, loc: storage)
+            self.training_type_plugin.restore_model_state_from_ckpt_path(
+                ckpt_path, map_location=lambda storage, loc: storage
+            )
         return ckpt_path
 
     def predict(
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 89e62cc51f543..66a44704ad19a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -547,8 +547,6 @@ def run_training_epoch(self):
             if self._num_training_batches_reached(is_last_batch):
                 break
 
-
-
             # progress global step according to grads progress
             self.increment_accumulated_grad_global_step()
 
diff --git a/test.json b/test.json
deleted file mode 100644
index d6a14fb12477c..0000000000000
--- a/test.json
+++ /dev/null
@@ -1 +0,0 @@
-{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 3, 'cpu_offload': True, 'cpu_offload_params': False, 'cpu_offload_use_pin_memory': False, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000.0, 'reduce_bucket_size': 200000000.0}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'train_micro_batch_size_per_gpu': 10, 'gradient_accumulation_steps': 1, 'gradient_clipping': 0, 'fp16': {'enabled': True, 'loss_scale': 0, 'initial_scale_power': 32, 'loss_scale_window': 1000, 'hysteresis': 2, 'min_loss_scale': 1}}
\ No newline at end of file
diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py
index 85f1968bee7eb..a1b6c03fd00d8 100644
--- a/tests/helpers/simple_models.py
+++ b/tests/helpers/simple_models.py
@@ -14,7 +14,6 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch import distributed as dist
 
 from pytorch_lightning import LightningModule
 from pytorch_lightning.metrics import Accuracy, MeanSquaredError
@@ -28,7 +27,7 @@ def __init__(self, lr=0.01):
         self.lr = lr
         for i in range(3):
             setattr(self, f"layer_{i}", nn.Linear(32, 32))
-            setattr(self, f"layer_{i}a", torch.nn.ReLU())
+            setattr(self, f"layer_{i}a", nn.ReLU())
         setattr(self, "layer_end", nn.Linear(32, 3))
 
         self.train_acc = Accuracy()
@@ -55,20 +54,20 @@ def training_step(self, batch, batch_idx):
         logits = self.forward(x)
         loss = F.cross_entropy(logits, y)
         self.log('train_loss', loss, prog_bar=True)
-        #self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
-        # self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
-        # self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
 
 
 class RegressionModel(LightningModule):
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index b91a178473e5a..125c33ab47f4d 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,23 +1,22 @@
 import json
 import os
-from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.core import datamodule
 from typing import Any
+
 import pytest
 import torch
-from torch import Tensor
+from torch import nn, Tensor
 from torch.optim import Optimizer
-from torch import nn
-from pytorch_lightning.metrics import Accuracy 
-from pytorch_lightning import Trainer, callbacks
+
+from pytorch_lightning import LightningModule, Trainer
+from pytorch_lightning.callbacks.base import Callback
+from pytorch_lightning.metrics import Accuracy
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.datamodules import ClassifDataModule
-from tests.helpers.simple_models import ClassificationModel
 from tests.helpers.runif import RunIf
-from pytorch_lightning import LightningModule
+from tests.helpers.simple_models import ClassificationModel
 
 
 def test_deepspeed_lightning_module(tmpdir):
@@ -454,13 +453,15 @@ def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir,
     """
         Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
     """
+
     class VerificationCallback(Callback):
 
-        def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
+        def on_train_batch_start(
+            self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
+        ) -> None:
             deepspeed_engine = trainer.training_type_plugin.model
             assert trainer.global_step == deepspeed_engine.global_steps
 
-
     model = ModelParallelClassificationModel()
     dm = ClassifDataModule()
     trainer = Trainer(

From f0cb6e744b1c6e0c40fb615620a2020a72614394 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Wed, 17 Mar 2021 10:13:27 +0000
Subject: [PATCH 14/60] Update DeepSpeed to use latest version, add some
 comments

---
 dockers/base-cuda/Dockerfile                    |  4 +---
 .../plugins/training_type/deepspeed.py          | 17 ++++++++++++-----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 843e47ca91289..d7c13e7560010 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -114,9 +114,7 @@ RUN \
     rm -rf apex
 
 RUN \
-    # install DeepSpeed from source.
-    # todo: swap to pypi release once DeepSpeed releases a new version >= 0.3.10
-    pip install deepspeed@git+https://github.com/microsoft/DeepSpeed@ec8b1cb
+    pip install deepspeed==0.3.13
 
 RUN \
     # Show what we have
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index fc99866cefe49..d7b4704450e5c 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -279,7 +279,6 @@ def _initialize_deepspeed_train(self, model):
             optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
         model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
         model, optimizer, _, lr_scheduler = deepspeed.initialize(
-            args=SimpleNamespace(local_rank=self.local_rank),
             model=model,
             model_parameters=model_parameters,
             optimizer=optimizer,
@@ -310,6 +309,7 @@ def _set_deepspeed_activation_checkpointing(self):
             )
 
     def _initialize_deepspeed_inference(self, model):
+        # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly
         optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
         if "optimizer" not in self.config:
             rank_zero_info(
@@ -318,6 +318,7 @@ def _initialize_deepspeed_inference(self, model):
             )
             optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
         inference_config = {
+            # todo: this is required for DeepSpeed throughput timers
             'train_micro_batch_size_per_gpu': 1,
             'fp16': self.config['fp16'],
         }
@@ -441,7 +442,11 @@ def _create_default_config(
         return cfg
 
     def _filepath_to_dir(self, filepath: str):
-        return filepath.split('.')[0]
+        return os.path.dirname(filepath)
+
+    @property
+    def deepspeed_engine(self):
+        return self.model
 
     def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
@@ -451,12 +456,13 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
             weights_only: saving model weights only
         """
         if torch.distributed.get_world_size() > 1:
+            # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
             # dump states as a checkpoint dictionary object
             client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
             save_dir = self._filepath_to_dir(filepath)
             _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers']
             client_state = {k: v for k, v in client_state.items() if k not in _exclude_keys}
-            self.model.save_checkpoint(save_dir, client_state=client_state)
+            self.deepspeed_engine.save_checkpoint(save_dir, client_state=client_state)
         else:
             self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath)
 
@@ -469,7 +475,8 @@ def restore_model_state_from_ckpt_path(self,
             save_dir = self._filepath_to_dir(ckpt_path)
 
             if self.zero_stage_3:
-                self.model.optimizer._partition_all_parameters()
+                # TODO: Currently required as this call is missing within the deepspeed engine.
+                self.deepspeed_engine.optimizer._partition_all_parameters()
 
             _, client_state = self.model.load_checkpoint(
                 save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states
@@ -487,7 +494,7 @@ def restore_model_state_from_ckpt_path(self,
         return {}, False
 
     def _accumulated_batches_reached(self, trainer):
-        return (trainer.total_batch_idx) % trainer.accumulate_grad_batches == 0
+        return trainer.total_batch_idx % trainer.accumulate_grad_batches == 0
 
     def increment_accumulated_grad_global_step(self, trainer):
         if self._original_accumulate_grad_batches is None:

From 914de8670062cb362384c77d0db41262b6fcc0fc Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Wed, 17 Mar 2021 10:56:40 +0000
Subject: [PATCH 15/60] add metrics

---
 .../plugins/training_type/deepspeed.py        |  7 +-
 .../connectors/checkpoint_connector.py        |  2 +-
 tests/helpers/pipelines.py                    |  2 +-
 tests/helpers/simple_models.py                |  6 +-
 tests/plugins/test_deepspeed_plugin.py        | 92 +++++++++++++------
 5 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 338dc7540e8fb..609e3f3e570aa 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -297,6 +297,8 @@ def _call_model_parallel_setup(self):
         if self.zero_stage_3:
             with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
                 self.lightning_module.trainer.call_hook("on_model_parallel_setup")
+        else:
+            self.lightning_module.trainer.call_hook("on_model_parallel_setup")
 
     def _set_deepspeed_activation_checkpointing(self):
         if self.config.get('activation_checkpointing'):
@@ -319,8 +321,11 @@ def _initialize_deepspeed_inference(self, model):
             optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
         inference_config = {
             'train_micro_batch_size_per_gpu': 1,
-            'fp16': self.config['fp16'],
         }
+        if 'fp16' in self.config:
+            inference_config.update({
+                "fp16": self.config["fp16"]
+            })
         if self.zero_stage_3:
             inference_config.update({
                 "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'],
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index 23dcd01d63feb..c68951f9a66c6 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -121,7 +121,7 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None:
         # restore model state_dict
         model.load_state_dict(checkpoint['state_dict'])
 
-    def restore_training_state(self, checkpoint, load_optimizer_states: bool):
+    def restore_training_state(self, checkpoint, load_optimizer_states: bool = True):
         """
         Restore trainer state.
         Model will get its change to update
diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py
index 403bcdfee8c1d..ebacad05b0a6f 100644
--- a/tests/helpers/pipelines.py
+++ b/tests/helpers/pipelines.py
@@ -100,7 +100,7 @@ def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50):
     # run prediction on 1 batch
     trained_model.cpu()
     trained_model.eval()
-
+    
     batch = next(iter(dataloader))
     x, y = batch
     x = x.flatten(1)
diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py
index 85f1968bee7eb..f71f3df16df33 100644
--- a/tests/helpers/simple_models.py
+++ b/tests/helpers/simple_models.py
@@ -55,20 +55,20 @@ def training_step(self, batch, batch_idx):
         logits = self.forward(x)
         loss = F.cross_entropy(logits, y)
         self.log('train_loss', loss, prog_bar=True)
-        #self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
-        # self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
-        # self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
 
 
 class RegressionModel(LightningModule):
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index b91a178473e5a..fef6b3af9f8dd 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,21 +1,20 @@
 import json
 import os
-from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.core import datamodule
+from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from typing import Any
 import pytest
 import torch
 from torch import Tensor
 from torch.optim import Optimizer
 from torch import nn
+import torch.nn.functional as F
 from pytorch_lightning.metrics import Accuracy 
-from pytorch_lightning import Trainer, callbacks
+from pytorch_lightning import Trainer, seed_everything
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.datamodules import ClassifDataModule
-from tests.helpers.simple_models import ClassificationModel
 from tests.helpers.runif import RunIf
 from pytorch_lightning import LightningModule
 
@@ -375,22 +374,56 @@ def on_model_parallel_setup(self) -> None:
         self.linear = torch.nn.Linear(32, 2)
 
 
-class ModelParallelClassificationModel(ClassificationModel):
+class ModelParallelClassificationModel(LightningModule):
 
     def __init__(self, lr=0.01):
         super().__init__()
-
         self.lr = lr
+
         self.train_acc = Accuracy()
         self.valid_acc = Accuracy()
         self.test_acc = Accuracy()
 
+    def make_block(self):
+        return nn.Sequential(nn.Linear(32, 32, bias=False), nn.BatchNorm1d(32), nn.LeakyReLU())
+
     def on_model_parallel_setup(self) -> None:
         for i in range(3):
-            setattr(self, f"layer_{i}", nn.Linear(32, 32))
-            setattr(self, f"layer_{i}a", nn.ReLU())
+            setattr(self, f"block_{i}", self.make_block())
         setattr(self, "layer_end", nn.Linear(32, 3))
 
+    def forward(self, x):
+        x = self.block_0(x)
+        x = self.block_1(x)
+        x = self.block_2(x)
+        x = self.layer_end(x)
+        logits = F.softmax(x, dim=1)
+        return logits
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
+        return [optimizer], []
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self.forward(x)
+        loss = F.cross_entropy(logits, y)
+        self.log('train_loss', loss, prog_bar=True)
+        self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
+        return {"loss": loss}
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self.forward(x)
+        self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
+        self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self.forward(x)
+        self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
+        self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
+
     def configure_optimizers(self):
         optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
         lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
@@ -421,32 +454,33 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
     """
         Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
     """
-    model = ModelParallelClassificationModel()
+    seed_everything(42)
+    model = ModelParallelClassificationModel(lr=0.1)
     dm = ClassifDataModule()
+    ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
     trainer = Trainer(
-        max_epochs=2,
-        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)],
+        max_epochs=10,
+        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)],
         default_root_dir=tmpdir,
         gpus=2,
-        limit_val_batches=2,
-        limit_test_batches=2,
         precision=16,
         accumulate_grad_batches=2,
+        callbacks=[ck]
     )
     trainer.fit(model, datamodule=dm)
-
+    results = trainer.test(model, datamodule=dm)
+    print(results)
+    results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
+    print(results)
     trainer = Trainer(
-        max_epochs=3,
+        max_epochs=1,
         plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)],
         default_root_dir=tmpdir,
         gpus=2,
-        limit_val_batches=2,
-        limit_test_batches=2,
         precision=16,
-        resume_from_checkpoint=trainer.checkpoint_callback.best_model_path,
+        resume_from_checkpoint=ck.best_model_path,
     )
     trainer.fit(model, datamodule=dm)
-    trainer.test(datamodule=dm)
 
 
 @RunIf(min_gpus=2, deepspeed=True)
@@ -454,27 +488,26 @@ def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir,
     """
         Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
     """
+    seed_everything(42)
     class VerificationCallback(Callback):
 
         def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
             deepspeed_engine = trainer.training_type_plugin.model
             assert trainer.global_step == deepspeed_engine.global_steps
 
-
-    model = ModelParallelClassificationModel()
+    model = ModelParallelClassificationModel(lr=0.1)
     dm = ClassifDataModule()
     trainer = Trainer(
-        max_epochs=2,
-        plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)],
-        default_root_dir=tmpdir,
+        max_epochs=5,
+        plugins=[DeepSpeedPlugin(stage=2, zero_optimization=False, cpu_offload=True)],
         gpus=2,
         limit_val_batches=2,
-        limit_test_batches=2,
-        precision=16,
-        accumulate_grad_batches=3,
+        precision=32,
+        accumulate_grad_batches=2,
         callbacks=[VerificationCallback()]
     )
     trainer.fit(model, datamodule=dm)
+    results = trainer.test(datamodule=dm)
 
     trainer = Trainer(
         max_epochs=3,
@@ -487,8 +520,9 @@ def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any,
         resume_from_checkpoint=trainer.checkpoint_callback.best_model_path,
         callbacks=[VerificationCallback()]
     )
-    trainer.fit(model, datamodule=dm)
-    trainer.test(datamodule=dm)
+    results = trainer.test(model, datamodule=dm)
+    # todo (tchaton) resolve different metrics
+    print(results)
 
 
 @RunIf(min_gpus=2, deepspeed=True)

From 712814c5e4b925395be46a99ab9b3498e565e8da Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Wed, 17 Mar 2021 11:00:03 +0000
Subject: [PATCH 16/60] Sort test imports, drop duplicate configure_optimizers

---
 tests/plugins/test_deepspeed_plugin.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index b89c2262e111f..5895c0761edf8 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,16 +1,16 @@
 import json
 import os
-from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from typing import Any
 
 import pytest
 import torch
+import torch.nn.functional as F
 from torch import nn, Tensor
 from torch.optim import Optimizer
-from torch import nn
-import torch.nn.functional as F
-from pytorch_lightning.metrics import Accuracy 
-from pytorch_lightning import Trainer, seed_everything
+
+from pytorch_lightning import LightningModule, seed_everything, Trainer
+from pytorch_lightning.callbacks import Callback, ModelCheckpoint
+from pytorch_lightning.metrics import Accuracy
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -400,10 +400,6 @@ def forward(self, x):
         logits = F.softmax(x, dim=1)
         return logits
 
-    def configure_optimizers(self):
-        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
-        return [optimizer], []
-
     def training_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
@@ -489,11 +485,13 @@ def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir,
         Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
     """
     seed_everything(42)
+
     class VerificationCallback(Callback):
 
         def on_train_batch_start(
             self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
         ) -> None:
+
             deepspeed_engine = trainer.training_type_plugin.model
             assert trainer.global_step == deepspeed_engine.global_steps
 

From a1644c63885beb1e55adc484154360c84167a0e1 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Wed, 17 Mar 2021 11:15:13 +0000
Subject: [PATCH 17/60] Small formatting fixes, clean up some code

---
 .../connectors/checkpoint_connector.py        |  5 +-
 tests/helpers/pipelines.py                    |  2 +-
 tests/helpers/simple_models.py                |  8 ++--
 tests/plugins/test_deepspeed_plugin.py        | 47 +++++++++----------
 4 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index c68951f9a66c6..09f9d1515f58f 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -91,15 +91,14 @@ def restore(self, checkpoint_path: str, on_gpu: bool) -> bool:
             return False
 
         checkpoint, load_optimizer_states = self.trainer.training_type_plugin.restore_model_state_from_ckpt_path(
-            checkpoint_path, map_location=lambda storage, loc: storage)
+            checkpoint_path, map_location=lambda storage, loc: storage
+        )
 
         model = self.trainer.lightning_module
 
         if on_gpu:
             model.cuda(self.trainer.root_gpu)
 
-        print(checkpoint)
-
         # restore training state
         self.restore_training_state(checkpoint, load_optimizer_states)
 
diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py
index ebacad05b0a6f..403bcdfee8c1d 100644
--- a/tests/helpers/pipelines.py
+++ b/tests/helpers/pipelines.py
@@ -100,7 +100,7 @@ def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50):
     # run prediction on 1 batch
     trained_model.cpu()
     trained_model.eval()
-    
+
     batch = next(iter(dataloader))
     x, y = batch
     x = x.flatten(1)
diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py
index a1b6c03fd00d8..1abeb1f00206a 100644
--- a/tests/helpers/simple_models.py
+++ b/tests/helpers/simple_models.py
@@ -27,7 +27,7 @@ def __init__(self, lr=0.01):
         self.lr = lr
         for i in range(3):
             setattr(self, f"layer_{i}", nn.Linear(32, 32))
-            setattr(self, f"layer_{i}a", nn.ReLU())
+            setattr(self, f"layer_{i}a", torch.nn.ReLU())
         setattr(self, "layer_end", nn.Linear(32, 3))
 
         self.train_acc = Accuracy()
@@ -54,20 +54,20 @@ def training_step(self, batch, batch_idx):
         logits = self.forward(x)
         loss = F.cross_entropy(logits, y)
         self.log('train_loss', loss, prog_bar=True)
-        self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('train_acc', self.train_acc(logits, y), prog_bar=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('val_acc', self.valid_acc(logits, y), prog_bar=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
         self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('test_acc', self.test_acc(logits, y), prog_bar=True)
 
 
 class RegressionModel(LightningModule):
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 5895c0761edf8..cfb212651101a 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -21,7 +21,7 @@
 
 def test_deepspeed_lightning_module(tmpdir):
     """
-        Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.
+    Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.
     """
 
     model = BoringModel()
@@ -39,7 +39,7 @@ def test_deepspeed_lightning_module(tmpdir):
 @RunIf(min_gpus=1)
 def test_deepspeed_lightning_module_precision(tmpdir):
     """
-        Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves tensors to half when precision 16.
+    Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves tensors to half when precision 16.
     """
 
     model = BoringModel()
@@ -89,7 +89,7 @@ def deepspeed_zero_config(deepspeed_config):
 @pytest.mark.parametrize("input", ("deepspeed", DeepSpeedPlugin))
 def test_deepspeed_plugin_string(tmpdir, input):
     """
-        Test to ensure that the plugin can be passed via string or instance, and parallel devices is correctly set.
+    Test to ensure that the plugin can be passed via string or instance, and parallel devices is correctly set.
     """
 
     trainer = Trainer(
@@ -133,8 +133,8 @@ def test_deepspeed_plugin_env(tmpdir, monkeypatch, deepspeed_config):
 )
 def test_deepspeed_precision_choice(amp_backend, tmpdir):
     """
-        Test to ensure precision plugin is also correctly chosen.
-        DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin
+    Test to ensure precision plugin is also correctly chosen.
+    DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin
     """
 
     trainer = Trainer(
@@ -165,7 +165,7 @@ def test_deepspeed_with_invalid_config_path(tmpdir):
 @RunIf(deepspeed=True)
 def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config):
     """
-        Test to ensure if we pass an env variable, we load the config from the path.
+    Test to ensure if we pass an env variable, we load the config from the path.
     """
     config_path = os.path.join(tmpdir, 'temp.json')
     with open(config_path, 'w') as f:
@@ -223,8 +223,10 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
 
 @RunIf(min_gpus=1, deepspeed=True)
 def test_deepspeed_run_configure_optimizers(tmpdir):
-    """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
-        whilst using configure_optimizers for optimizers and schedulers."""
+    """
+    Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
+    whilst using configure_optimizers for optimizers and schedulers.
+    """
 
     class TestModel(BoringModel):
 
@@ -254,8 +256,8 @@ def on_train_start(self) -> None:
 @RunIf(min_gpus=1, deepspeed=True)
 def test_deepspeed_config(tmpdir, deepspeed_zero_config):
     """
-        Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
-        and saves the model weights to load correctly.
+    Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
+    and saves the model weights to load correctly.
     """
 
     class TestModel(BoringModel):
@@ -348,7 +350,7 @@ def on_train_start(self) -> None:
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu(tmpdir, deepspeed_config):
     """
-        Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation.
+    Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation.
     """
     model = BoringModel()
     trainer = Trainer(
@@ -376,9 +378,10 @@ def on_model_parallel_setup(self) -> None:
 
 class ModelParallelClassificationModel(LightningModule):
 
-    def __init__(self, lr=0.01):
+    def __init__(self, lr: float = 0.01, num_blocks: int = 3):
         super().__init__()
         self.lr = lr
+        self.num_blocks = num_blocks
 
         self.train_acc = Accuracy()
         self.valid_acc = Accuracy()
@@ -388,15 +391,10 @@ def make_block(self):
         return nn.Sequential(nn.Linear(32, 32, bias=False), nn.BatchNorm1d(32), nn.LeakyReLU())
 
     def on_model_parallel_setup(self) -> None:
-        for i in range(3):
-            setattr(self, f"block_{i}", self.make_block())
-        setattr(self, "layer_end", nn.Linear(32, 3))
+        self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3))
 
     def forward(self, x):
-        x = self.block_0(x)
-        x = self.block_1(x)
-        x = self.block_2(x)
-        x = self.layer_end(x)
+        x = self.model(x)
         logits = F.softmax(x, dim=1)
         return logits
 
@@ -429,7 +427,7 @@ def configure_optimizers(self):
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
     """
-        Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation.
+    Test to ensure ZeRO Stage 3 works with a parallel model.
     """
     model = ModelParallelBoringModel()
     trainer = Trainer(
@@ -448,7 +446,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
     """
-        Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
+    Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
     """
     seed_everything(42)
     model = ModelParallelClassificationModel(lr=0.1)
@@ -480,9 +478,9 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
 
 
 @RunIf(min_gpus=2, deepspeed=True)
-def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir, deepspeed_config):
+def test_deepspeed_multigpu_stage_2_checkpointing_accumulated_grad_batches(tmpdir, deepspeed_config):
     """
-        Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
+    Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.
     """
     seed_everything(42)
 
@@ -491,7 +489,6 @@ class VerificationCallback(Callback):
         def on_train_batch_start(
             self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
         ) -> None:
-
             deepspeed_engine = trainer.training_type_plugin.model
             assert trainer.global_step == deepspeed_engine.global_steps
 
@@ -528,7 +525,7 @@ def on_train_batch_start(
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
     """
-        Test to ensure we can use DeepSpeed with just test.
+    Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3.
     """
     model = ModelParallelBoringModel()
     trainer = Trainer(

From 64f624f36161208bcb0157a271d7309529620686 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Wed, 17 Mar 2021 22:05:37 +0000
Subject: [PATCH 18/60] Few cleanups

---
 pytorch_lightning/trainer/training_loop.py | 2 --
 tests/plugins/test_deepspeed_plugin.py     | 7 ++++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 66a44704ad19a..5420bb038caca 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -632,8 +632,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):
                         opt_idx=opt_idx,
                     )
 
-                    self.trainer.training_type_plugin.on_batch
-
                 # ------------------------------
                 # BACKWARD PASS
                 # ------------------------------
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index cfb212651101a..42158e46a8641 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -386,9 +386,10 @@ def __init__(self, lr: float = 0.01, num_blocks: int = 3):
         self.train_acc = Accuracy()
         self.valid_acc = Accuracy()
         self.test_acc = Accuracy()
+        self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3))
 
     def make_block(self):
-        return nn.Sequential(nn.Linear(32, 32, bias=False), nn.BatchNorm1d(32), nn.LeakyReLU())
+        return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU())
 
     def on_model_parallel_setup(self) -> None:
         self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3))
@@ -538,12 +539,12 @@ def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
     trainer.test(model)
 
 
-def _assert_save_model_is_equal(model, tmpdir, trainer):
+def _assert_save_model_is_equal(model, tmpdir, trainer, cls=BoringModel):
     checkpoint_path = os.path.join(tmpdir, 'model.pt')
     trainer.save_checkpoint(checkpoint_path)
     # carry out the check only on rank 0
     if trainer.global_rank == 0:
-        saved_model = BoringModel.load_from_checkpoint(checkpoint_path)
+        saved_model = cls.load_from_checkpoint(checkpoint_path)
         if model.dtype == torch.half:
             saved_model = saved_model.half()  # model is loaded in float32 as default, move it to float16
         model = model.cpu()

From 89fbbcbb9fd763661f61683c005f63d68b3aad38 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Thu, 18 Mar 2021 10:48:33 +0000
Subject: [PATCH 19/60] No need for default state

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 2e255b4255875..ba4a1838b4c28 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -324,9 +324,7 @@ def _initialize_deepspeed_inference(self, model):
             'train_micro_batch_size_per_gpu': 1,
         }
         if 'fp16' in self.config:
-            inference_config.update({
-                "fp16": self.config["fp16"]
-            })
+            inference_config.update({"fp16": self.config["fp16"]})
         if self.zero_stage_3:
             inference_config.update({
                 "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'],
@@ -494,9 +492,7 @@ def restore_model_state_from_ckpt_path(self,
             # hook: give user access to checkpoint if needed.
             self.lightning_module.on_load_checkpoint(client_state)
             return client_state, False
-        else:
-            super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location)
-        return {}, False
+        return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location)
 
     def _accumulated_batches_reached(self, trainer):
         return trainer.total_batch_idx % trainer.accumulate_grad_batches == 0

From 701d41758fec1e1178588da24720f0b0b3f8a922 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Thu, 18 Mar 2021 12:40:07 +0000
Subject: [PATCH 20/60] Fix tests, add some boilerplate that should move
 eventually

---
 .../plugins/training_type/deepspeed.py        | 11 ++-
 tests/plugins/test_deepspeed_plugin.py        | 73 ++++++++-----------
 2 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index ba4a1838b4c28..0820f64aa3323 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -203,6 +203,7 @@ def __init__(
         self.loss_scale_window = loss_scale_window
         self.hysteresis = hysteresis
         self.min_loss_scale = min_loss_scale
+        self.on_model_parallel_setup_called = False
 
     def _load_config(self, config):
         if config is None and self.DEEPSPEED_ENV_VAR in os.environ:
@@ -293,11 +294,13 @@ def _initialize_deepspeed_train(self, model):
         self.model = model
 
     def _call_model_parallel_setup(self):
-        if self.zero_stage_3:
-            with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
+        if not self.on_model_parallel_setup_called:
+            if self.zero_stage_3:
+                with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
+                    self.lightning_module.trainer.call_hook("on_model_parallel_setup")
+            else:
                 self.lightning_module.trainer.call_hook("on_model_parallel_setup")
-        else:
-            self.lightning_module.trainer.call_hook("on_model_parallel_setup")
+            self.on_model_parallel_setup_called = True
 
     def _set_deepspeed_activation_checkpointing(self):
         if self.config.get('activation_checkpointing'):
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 42158e46a8641..e740edc353165 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -378,7 +378,7 @@ def on_model_parallel_setup(self) -> None:
 
 class ModelParallelClassificationModel(LightningModule):
 
-    def __init__(self, lr: float = 0.01, num_blocks: int = 3):
+    def __init__(self, lr: float = 0.01, num_blocks: int = 5):
         super().__init__()
         self.lr = lr
         self.num_blocks = num_blocks
@@ -386,7 +386,6 @@ def __init__(self, lr: float = 0.01, num_blocks: int = 3):
         self.train_acc = Accuracy()
         self.valid_acc = Accuracy()
         self.test_acc = Accuracy()
-        self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3))
 
     def make_block(self):
         return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU())
@@ -396,6 +395,8 @@ def on_model_parallel_setup(self) -> None:
 
     def forward(self, x):
         x = self.model(x)
+        # Ensure output is in float32 for softmax operation
+        x = x.float()
         logits = F.softmax(x, dim=1)
         return logits
 
@@ -404,25 +405,29 @@ def training_step(self, batch, batch_idx):
         logits = self.forward(x)
         loss = F.cross_entropy(logits, y)
         self.log('train_loss', loss, prog_bar=True)
-        self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('train_acc', self.train_acc(logits, y), prog_bar=True, sync_dist=True)
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
-        self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False, sync_dist=True)
+        self.log('val_acc', self.valid_acc(logits, y), prog_bar=True, sync_dist=True)
 
     def test_step(self, batch, batch_idx):
         x, y = batch
         logits = self.forward(x)
-        self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False)
-        self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True)
+        self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False, sync_dist=True)
+        self.log('test_acc', self.test_acc(logits, y), prog_bar=True, sync_dist=True)
 
     def configure_optimizers(self):
         optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
-        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
-        return [optimizer], [lr_scheduler]
+
+        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
+        return [optimizer], [{
+            'scheduler': lr_scheduler,
+            'interval': 'step',
+        }]
 
 
 @RunIf(min_gpus=2, deepspeed=True)
@@ -447,15 +452,16 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
 @RunIf(min_gpus=2, deepspeed=True)
 def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
     """
-    Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint
+    Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint,
+    and see convergence.
     """
     seed_everything(42)
-    model = ModelParallelClassificationModel(lr=0.1)
+    model = ModelParallelClassificationModel()
     dm = ClassifDataModule()
     ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1)
     trainer = Trainer(
         max_epochs=10,
-        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)],
+        plugins=[DeepSpeedPlugin(stage=3)],
         default_root_dir=tmpdir,
         gpus=2,
         precision=16,
@@ -463,23 +469,18 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
         callbacks=[ck]
     )
     trainer.fit(model, datamodule=dm)
+
     results = trainer.test(model, datamodule=dm)
-    print(results)
-    results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
-    print(results)
-    trainer = Trainer(
-        max_epochs=1,
-        plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)],
-        default_root_dir=tmpdir,
-        gpus=2,
-        precision=16,
-        resume_from_checkpoint=ck.best_model_path,
-    )
-    trainer.fit(model, datamodule=dm)
+    assert results[0]['test_acc'] > 0.7
+
+    saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm)
+    assert saved_results[0]['test_acc'] > 0.7
+    assert saved_results == results
 
 
 @RunIf(min_gpus=2, deepspeed=True)
-def test_deepspeed_multigpu_stage_2_checkpointing_accumulated_grad_batches(tmpdir, deepspeed_config):
+@pytest.mark.parametrize('cpu_offload', [True, False])
+def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, cpu_offload):
     """
     Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.
     """
@@ -493,34 +494,18 @@ def on_train_batch_start(
             deepspeed_engine = trainer.training_type_plugin.model
             assert trainer.global_step == deepspeed_engine.global_steps
 
-    model = ModelParallelClassificationModel(lr=0.1)
+    model = ModelParallelClassificationModel()
     dm = ClassifDataModule()
     trainer = Trainer(
         max_epochs=5,
-        plugins=[DeepSpeedPlugin(stage=2, zero_optimization=False, cpu_offload=True)],
+        plugins=[DeepSpeedPlugin(stage=2, cpu_offload=cpu_offload)],
         gpus=2,
         limit_val_batches=2,
-        precision=32,
+        precision=16,
         accumulate_grad_batches=2,
         callbacks=[VerificationCallback()]
     )
     trainer.fit(model, datamodule=dm)
-    results = trainer.test(datamodule=dm)
-
-    trainer = Trainer(
-        max_epochs=3,
-        plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)],
-        default_root_dir=tmpdir,
-        gpus=2,
-        limit_val_batches=2,
-        limit_test_batches=2,
-        precision=16,
-        resume_from_checkpoint=trainer.checkpoint_callback.best_model_path,
-        callbacks=[VerificationCallback()]
-    )
-    results = trainer.test(model, datamodule=dm)
-    # todo (tchaton) resolve different metrics
-    print(results)
 
 
 @RunIf(min_gpus=2, deepspeed=True)

From 270d6ed04843e21537b410f43f66743c73311158 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Mon, 22 Mar 2021 17:41:39 +0000
Subject: [PATCH 21/60] Add hook removal

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 0820f64aa3323..3db03c5dd8761 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -35,6 +35,7 @@
 
 if _DEEPSPEED_AVAILABLE:
     import deepspeed
+    from deepspeed.runtime.zero.stage3 import remove_module_hooks
 
 
 class LightningDeepSpeedModule(_LightningModuleWrapperBase):
@@ -333,6 +334,8 @@ def _initialize_deepspeed_inference(self, model):
                 "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'],
                 "zero_optimization": self.config['zero_optimization'],
             })
+        # Remove all module hooks before initializing new model
+        remove_module_hooks(model)
         model, _, _, _ = deepspeed.initialize(
             args=SimpleNamespace(local_rank=self.local_rank),
             model=model,

From a236ff050cde4ff8c6f98a074e1ba5a1c5bd35d8 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Tue, 23 Mar 2021 10:51:04 +0000
Subject: [PATCH 22/60] Add a context manager to handle hook

---
 pytorch_lightning/accelerators/accelerator.py | 15 +++-
 .../plugins/training_type/deepspeed.py        | 82 +++++++++----------
 .../training_type/training_type_plugin.py     | 14 +++-
 pytorch_lightning/trainer/trainer.py          |  7 ++
 4 files changed, 73 insertions(+), 45 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 60e6ea88b4250..a5b89215b1e25 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -11,7 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union
+import contextlib
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union, Generator
 
 import torch
 from torch.optim import Optimizer
@@ -432,3 +433,15 @@ def results(self) -> Any:
         In distributed training, we make sure to transfer the results to the appropriate master process.
         """
         return self.training_type_plugin.results
+
+    @contextlib.contextmanager
+    def model_parallel_context(self) -> Generator:
+        """
+        Provide hook to create modules in a parallel aware context. This is useful for when we'd like to
+        shard the model instantly, which is useful for extremely large models which can save memory and
+        initialization time.
+
+        Returns: Model parallel context.
+        """
+        with self.training_type_plugin.model_parallel_context():
+            yield
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 00738a18f7e33..0a90b5ea0d05f 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -11,13 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import contextlib
 import json
 import logging
 import os
 from pathlib import Path
 from types import SimpleNamespace
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Generator
 
 import torch
 
@@ -64,33 +64,33 @@ class DeepSpeedPlugin(DDPPlugin):
     DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH"
 
     def __init__(
-        self,
-        zero_optimization: bool = True,
-        stage: int = 2,
-        cpu_offload: bool = False,
-        cpu_offload_params: bool = False,
-        cpu_offload_use_pin_memory: bool = False,
-        contiguous_gradients: bool = True,
-        overlap_comm: bool = True,
-        allgather_partitions: bool = True,
-        reduce_scatter: bool = True,
-        allgather_bucket_size: int = 2e8,
-        reduce_bucket_size: int = 2e8,
-        zero_allow_untested_optimizer: bool = True,
-        config: Optional[Union[Path, str, dict]] = None,
-        logging_level: int = logging.WARN,
-        num_nodes: int = 1,
-        parallel_devices: Optional[List[torch.device]] = None,
-        cluster_environment: Optional[ClusterEnvironment] = None,
-        loss_scale: float = 0,
-        initial_scale_power: int = 32,
-        loss_scale_window: int = 1000,
-        hysteresis: int = 2,
-        min_loss_scale: int = 1,
-        partition_activations: bool = False,
-        cpu_checkpointing: bool = False,
-        contiguous_memory_optimization: bool = False,
-        synchronize_checkpoint_boundary: bool = False,
+            self,
+            zero_optimization: bool = True,
+            stage: int = 2,
+            cpu_offload: bool = False,
+            cpu_offload_params: bool = False,
+            cpu_offload_use_pin_memory: bool = False,
+            contiguous_gradients: bool = True,
+            overlap_comm: bool = True,
+            allgather_partitions: bool = True,
+            reduce_scatter: bool = True,
+            allgather_bucket_size: int = 2e8,
+            reduce_bucket_size: int = 2e8,
+            zero_allow_untested_optimizer: bool = True,
+            config: Optional[Union[Path, str, dict]] = None,
+            logging_level: int = logging.WARN,
+            num_nodes: int = 1,
+            parallel_devices: Optional[List[torch.device]] = None,
+            cluster_environment: Optional[ClusterEnvironment] = None,
+            loss_scale: float = 0,
+            initial_scale_power: int = 32,
+            loss_scale_window: int = 1000,
+            hysteresis: int = 2,
+            min_loss_scale: int = 1,
+            partition_activations: bool = False,
+            cpu_checkpointing: bool = False,
+            contiguous_memory_optimization: bool = False,
+            synchronize_checkpoint_boundary: bool = False,
     ) -> None:
         """
 
@@ -204,7 +204,6 @@ def __init__(
         self.loss_scale_window = loss_scale_window
         self.hysteresis = hysteresis
         self.min_loss_scale = min_loss_scale
-        self.on_model_parallel_setup_called = False
 
     def _load_config(self, config):
         if config is None and self.DEEPSPEED_ENV_VAR in os.environ:
@@ -237,8 +236,6 @@ def init_deepspeed(self):
         if self.on_gpu:
             torch.cuda.set_device(self.root_device)
 
-        self._call_model_parallel_setup()
-
         if self.lightning_module.trainer and self.lightning_module.trainer.training:
             self._initialize_deepspeed_train(model)
         else:
@@ -283,14 +280,13 @@ def _initialize_deepspeed_train(self, model):
         self.lightning_module.trainer.schedulers = [lr_scheduler]
         self.model = model
 
-    def _call_model_parallel_setup(self):
-        if not self.on_model_parallel_setup_called:
-            if self.zero_stage_3:
-                with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
-                    self.lightning_module.trainer.call_hook("on_model_parallel_setup")
-            else:
-                self.lightning_module.trainer.call_hook("on_model_parallel_setup")
-            self.on_model_parallel_setup_called = True
+    @contextlib.contextmanager
+    def model_parallel_context(self) -> Generator:
+        if self.zero_stage_3:
+            with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
+                yield
+        else:
+            super().model_parallel_context()
 
     def _set_deepspeed_activation_checkpointing(self):
         if self.config.get('activation_checkpointing'):
@@ -419,9 +415,9 @@ def _format_precision_config(self):
             raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.")
 
     def _create_default_config(
-        self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool,
-        cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool,
-        **zero_kwargs
+            self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool,
+            cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool,
+            **zero_kwargs
     ) -> Dict:
         cfg = {
             'activation_checkpointing': {
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 08ccc8115d788..d6bcc16bddb1f 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -11,8 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union
+from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union, Generator
 
 import torch
 from torch.nn import Module
@@ -209,3 +210,14 @@ def restore_model_state_from_ckpt_path(self,
 
     def increment_accumulated_grad_global_step(self, trainer) -> None:
         trainer.global_step += 1
+
+    @contextlib.contextmanager
+    def model_parallel_context(self) -> Generator:
+        """
+        Provide hook to create modules in a parallel aware context. This is useful for when we'd like to
+        shard the model instantly, which is useful for extremely large models which can save memory and
+        initialization time.
+
+        Returns: Model parallel context.
+        """
+        yield
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index e1472410fe347..34eee2a298eb6 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -432,6 +432,7 @@ def fit(
         self.accelerator.setup_environment()
         self.call_setup_hook(model)  # allow user to setup lightning_module in accelerator environment
         self.accelerator.setup(self, model)  # note: this sets up self.lightning_module
+        self.call_model_parallel_hook(model)  # allow user to setup in model parallel environment
 
         # ----------------------------
         # INSPECT THE CORE LOOPS
@@ -1075,6 +1076,12 @@ def call_setup_hook(self, model: LightningModule) -> None:
         self.setup(model, stage=state)
         model.setup(stage=state)
 
+    def call_model_parallel_hook(self, model: LightningModule) -> None:
+        if not hasattr(self.lightning_module, 'has_model_parallel_setup'):
+            with self.accelerator.model_parallel_context():
+                model.on_model_parallel_setup()
+            self.lightning_module.has_model_parallel_setup = True
+
     def call_teardown_hook(self, model: LightningModule) -> None:
         state = self._teardown_state
         self.profiler.teardown(stage=state)

From e1f865e221f02d6a247b1aaf29c1ccfd8f24502b Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Thu, 25 Mar 2021 16:25:25 +0000
Subject: [PATCH 23/60] Small naming cleanup

---
 .../plugins/training_type/deepspeed.py        | 68 +++++++++----------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 0a90b5ea0d05f..4e45896f2f7d4 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -17,7 +17,7 @@
 import os
 from pathlib import Path
 from types import SimpleNamespace
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Generator
+from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
 
 import torch
 
@@ -64,33 +64,33 @@ class DeepSpeedPlugin(DDPPlugin):
     DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH"
 
     def __init__(
-            self,
-            zero_optimization: bool = True,
-            stage: int = 2,
-            cpu_offload: bool = False,
-            cpu_offload_params: bool = False,
-            cpu_offload_use_pin_memory: bool = False,
-            contiguous_gradients: bool = True,
-            overlap_comm: bool = True,
-            allgather_partitions: bool = True,
-            reduce_scatter: bool = True,
-            allgather_bucket_size: int = 2e8,
-            reduce_bucket_size: int = 2e8,
-            zero_allow_untested_optimizer: bool = True,
-            config: Optional[Union[Path, str, dict]] = None,
-            logging_level: int = logging.WARN,
-            num_nodes: int = 1,
-            parallel_devices: Optional[List[torch.device]] = None,
-            cluster_environment: Optional[ClusterEnvironment] = None,
-            loss_scale: float = 0,
-            initial_scale_power: int = 32,
-            loss_scale_window: int = 1000,
-            hysteresis: int = 2,
-            min_loss_scale: int = 1,
-            partition_activations: bool = False,
-            cpu_checkpointing: bool = False,
-            contiguous_memory_optimization: bool = False,
-            synchronize_checkpoint_boundary: bool = False,
+        self,
+        zero_optimization: bool = True,
+        stage: int = 2,
+        cpu_offload: bool = False,
+        cpu_offload_params: bool = False,
+        cpu_offload_use_pin_memory: bool = False,
+        contiguous_gradients: bool = True,
+        overlap_comm: bool = True,
+        allgather_partitions: bool = True,
+        reduce_scatter: bool = True,
+        allgather_bucket_size: int = 2e8,
+        reduce_bucket_size: int = 2e8,
+        zero_allow_untested_optimizer: bool = True,
+        config: Optional[Union[Path, str, dict]] = None,
+        logging_level: int = logging.WARN,
+        num_nodes: int = 1,
+        parallel_devices: Optional[List[torch.device]] = None,
+        cluster_environment: Optional[ClusterEnvironment] = None,
+        loss_scale: float = 0,
+        initial_scale_power: int = 32,
+        loss_scale_window: int = 1000,
+        hysteresis: int = 2,
+        min_loss_scale: int = 1,
+        partition_activations: bool = False,
+        cpu_checkpointing: bool = False,
+        contiguous_memory_optimization: bool = False,
+        synchronize_checkpoint_boundary: bool = False,
     ) -> None:
         """
 
@@ -415,9 +415,9 @@ def _format_precision_config(self):
             raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.")
 
     def _create_default_config(
-            self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool,
-            cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool,
-            **zero_kwargs
+        self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool,
+        cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool,
+        **zero_kwargs
     ) -> Dict:
         cfg = {
             'activation_checkpointing': {
@@ -465,15 +465,15 @@ def restore_model_state_from_ckpt_path(self,
                                            map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
         if torch.distributed.get_world_size() > 1:
             from pytorch_lightning.trainer.states import TrainerState
-            load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING
+            stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING
             save_dir = self._filepath_to_dir(ckpt_path)
 
             if self.zero_stage_3:
                 # TODO: Currently required as this call is missing within the deepspeed engine.
                 self.deepspeed_engine.optimizer._partition_all_parameters()
 
-            _, client_state = self.model.load_checkpoint(
-                save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states
+            _, client_state = self.deepspeed_engine.load_checkpoint(
+                save_dir, load_optimizer_states=stage_is_fit, load_lr_scheduler_states=stage_is_fit
             )
 
             # restore datamodule states

From 80fb792873088955bb411fbe29b4005375eef7d6 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Fri, 26 Mar 2021 09:34:36 +0000
Subject: [PATCH 24/60] wip

---
 .../plugins/training_type/deepspeed.py        | 21 ++++++++++++----
 tests/plugins/test_deepspeed_plugin.py        | 24 ++++++++++++++++++-
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 4e45896f2f7d4..12f2ae93e925c 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
+from collections import OrderedDict
 import json
 import logging
 import os
@@ -35,7 +36,17 @@
 
 if _DEEPSPEED_AVAILABLE:
     import deepspeed
-    from deepspeed.runtime.zero.stage3 import remove_module_hooks
+    # from deepspeed.runtime.zero.stage3 import remove_module_hooks
+
+
+def remove_module_hooks(model: torch.nn.Module) -> None:
+    for module in model.modules():
+        module._backward_hooks = OrderedDict()
+        module._is_full_backward_hook = None
+        module._forward_hooks = OrderedDict()
+        module._forward_pre_hooks = OrderedDict()
+        module._state_dict_hooks = OrderedDict()
+        module._load_state_dict_pre_hooks = OrderedDict()
 
 
 class LightningDeepSpeedModule(_LightningModuleWrapperBase):
@@ -283,10 +294,12 @@ def _initialize_deepspeed_train(self, model):
     @contextlib.contextmanager
     def model_parallel_context(self) -> Generator:
         if self.zero_stage_3:
-            with deepspeed.zero.Init(remote_device="cpu", pin_memory=True):
-                yield
+            model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True)
         else:
-            super().model_parallel_context()
+            model_parallel_context = super().model_parallel_context()
+        
+        with model_parallel_context:
+            yield
 
     def _set_deepspeed_activation_checkpointing(self):
         if self.config.get('activation_checkpointing'):
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 3fe121e2ef565..7c7e77df56285 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -420,6 +420,11 @@ def test_step(self, batch, batch_idx):
         self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False, sync_dist=True)
         self.log('test_acc', self.test_acc(logits, y), prog_bar=True, sync_dist=True)
 
+    def predict_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self.forward(x)
+        return self.test_acc(logits, y).compute()
+
     def configure_optimizers(self):
         optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
 
@@ -450,7 +455,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
 
 
 @RunIf(min_gpus=2, deepspeed=True)
-def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
+def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir):
     """
     Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint,
     and see convergence.
@@ -477,6 +482,23 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config):
     assert saved_results[0]['test_acc'] > 0.7
     assert saved_results == results
 
+    trainer = Trainer(
+        max_epochs=10,
+        plugins=[DeepSpeedPlugin(stage=3)],
+        default_root_dir=tmpdir,
+        gpus=2,
+        precision=16,
+        accumulate_grad_batches=2,
+        callbacks=[ck],
+        resume_from_checkpoint=ck.best_model_path
+    )
+    results = trainer.test(model, datamodule=dm)
+    assert results[0]['test_acc'] > 0.7
+
+    dm.predict_dataloader = dm.test_dataloader
+    results = trainer.predict(model, datamodule=dm)
+    assert results[0]['test_acc'] > 0.7
+
 
 @RunIf(min_gpus=2, deepspeed=True)
 @pytest.mark.parametrize('cpu_offload', [True, False])

From 1de2bcd8da73df727d417d0a3d71dc4adf0728b3 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Fri, 26 Mar 2021 10:23:35 +0000
Subject: [PATCH 25/60] move save_checkpoint responsibility to accelerator

---
 pytorch_lightning/accelerators/accelerator.py |  14 ++-
 .../plugins/training_type/deepspeed.py        |  10 +-
 .../plugins/training_type/tpu_spawn.py        |  11 +-
 .../training_type/training_type_plugin.py     |  28 ++++-
 .../connectors/checkpoint_connector.py        | 107 +-----------------
 pytorch_lightning/utilities/cloud_io.py       | 103 +++++++++++++++++
 6 files changed, 157 insertions(+), 116 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index d4cabd088b26b..4b5b6ede2f10b 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -17,7 +17,7 @@
 import torch
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
-
+import pytorch_lightning as pl
 from pytorch_lightning.core import LightningModule
 from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin
 from pytorch_lightning.plugins.training_type import TrainingTypePlugin
@@ -479,3 +479,15 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None:
             ' It will be removed in v1.5.'
         )
         self.setup_precision_plugin(plugin)
+
+    def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None:
+        """Save model/training states as a checkpoint file through state-dump and file-write.
+
+        Args:
+            filepath: write-target file's path
+            weights_only: saving model weights only
+        """
+        self.training_type_plugin.save_checkpoint(trainer, filepath, weights_only)
+
+
+
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 12f2ae93e925c..89282aac20e39 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -33,6 +33,8 @@
 from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE
+from pytorch_lightning.utilities.cloud_io import dump_checkpoint
+
 
 if _DEEPSPEED_AVAILABLE:
     import deepspeed
@@ -231,7 +233,6 @@ def _load_config(self, config):
 
     def pre_dispatch(self):
         self.init_deepspeed()
-        self.lightning_module.trainer.save_checkpoint = self.save_checkpoint
         self.barrier()
 
     def init_deepspeed(self):
@@ -455,7 +456,7 @@ def _filepath_to_dir(self, filepath: str):
     def deepspeed_engine(self):
         return self.model
 
-    def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
+    def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
 
         Args:
@@ -465,13 +466,14 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None:
         if torch.distributed.get_world_size() > 1:
             # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
             # dump states as a checkpoint dictionary object
-            client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
+            client_state = dump_checkpoint(trainer, weights_only)
             save_dir = self._filepath_to_dir(filepath)
             _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers']
             client_state = {k: v for k, v in client_state.items() if k not in _exclude_keys}
             self.deepspeed_engine.save_checkpoint(save_dir, client_state=client_state)
+
         else:
-            self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath)
+            super().save_checkpoint(trainer, filepath, weights_only)
 
     def restore_model_state_from_ckpt_path(self,
                                            ckpt_path: str,
diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index a8706d54cb5c9..d8d2267547877 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -18,7 +18,7 @@
 
 import torch
 import torch.multiprocessing as mp
-
+import pytorch_lightning as pl
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin
 from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle
@@ -27,6 +27,9 @@
 from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import seed_everything
+from pytorch_lightning.utilities.cloud_io import dump_checkpoint
+
+
 
 if _TPU_AVAILABLE:
     import torch_xla.core.xla_model as xm
@@ -106,8 +109,6 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
         trainer.accelerator.setup_optimizers(trainer)
         trainer.precision_plugin.connect(self._model, None, None)
 
-        # replace trainer save_checkpoint to use `xm.save`
-        trainer.save_checkpoint = self.save_checkpoint
         self.barrier("pre-run-stage")
 
         results = trainer.run_stage()
@@ -298,7 +299,7 @@ def test_step(self, *args, **kwargs):
     def predict_step(self, *args, **kwargs):
         return self.lightning_module.predict_step(*args, **kwargs)
 
-    def save_checkpoint(self, filepath, weights_only: bool = False):
+    def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
 
         Args:
@@ -306,6 +307,6 @@ def save_checkpoint(self, filepath, weights_only: bool = False):
             weights_only: saving model weights only
         """
         # dump states as a checkpoint dictionary object
-        _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only)
+        _checkpoint = dump_checkpoint(trainer, weights_only)
         # Todo: TypeError: 'mappingproxy' object does not support item assignment
         self.save({k: v for k, v in _checkpoint.items() if k != "callbacks"}, filepath)
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 086be8446857f..e871e9135b320 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -14,6 +14,8 @@
 import contextlib
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union, Generator
+from pytorch_lightning.utilities.cloud_io import atomic_save
+from pytorch_lightning.utilities import rank_zero_warn
 
 import torch
 from torch.nn import Module
@@ -23,7 +25,7 @@
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import unwrap_lightning_module
 from pytorch_lightning.plugins.base_plugin import Plugin
-from pytorch_lightning.utilities.cloud_io import load as pl_load
+from pytorch_lightning.utilities.cloud_io import load as pl_load, dump_checkpoint
 
 if TYPE_CHECKING:
     from pytorch_lightning.trainer.trainer import Trainer
@@ -221,3 +223,27 @@ def model_parallel_context(self) -> Generator:
         Returns: Model parallel context.
         """
         yield
+
+    def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None:
+        """Save model/training states as a checkpoint file through state-dump and file-write.
+
+        Args:
+            filepath: write-target file's path
+            weights_only: saving model weights only
+        """
+        # dump states as a checkpoint dictionary object
+        checkpoint = dump_checkpoint(trainer, weights_only)
+        if trainer.is_global_zero:
+            # write the checkpoint dictionary on the file
+
+            checkpoint = self.on_save(checkpoint)
+            try:
+                atomic_save(checkpoint, filepath)
+            except AttributeError as err:
+                if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint:
+                    del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY]
+                rank_zero_warn(
+                    'Warning, `hyper_parameters` dropped from checkpoint.'
+                    f' An attribute is not picklable {err}'
+                )
+                atomic_save(checkpoint, filepath)
\ No newline at end of file
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index 09f9d1515f58f..d79863a86fc77 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -19,7 +19,7 @@
 
 import torch
 
-import pytorch_lightning
+import pytorch_lightning as pl
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.utilities import (
     _APEX_AVAILABLE,
@@ -236,94 +236,6 @@ def hpc_save(self, folderpath: str, logger):
 
         return filepath
 
-    def dump_checkpoint(self, weights_only: bool = False) -> dict:
-        """Creating a model checkpoint dictionary object from various component states.
-
-        Args:
-            weights_only: saving model weights only
-
-        Return:
-            structured dictionary: {
-                'epoch':                     training epoch
-                'global_step':               training global step
-                'pytorch-lightning_version': PyTorch Lightning's version
-                'callbacks':                 "callback specific state"[] # if not weights_only
-                'optimizer_states':          "PT optim's state_dict"[]   # if not weights_only
-                'lr_schedulers':             "PT sched's state_dict"[]   # if not weights_only
-                'native_amp_scaling_state':  PT amp's state_dict         # if not weights_only and use native amp
-                'amp_scaling_state':         Apex's state_dict           # if not weights_only and use apex amp
-                'state_dict':                Model's state_dict (e.g. network weights)
-                CHECKPOINT_HYPER_PARAMS_NAME:
-                CHECKPOINT_HYPER_PARAMS_KEY:
-                CHECKPOINT_HYPER_PARAMS_TYPE:
-                something_cool_i_want_to_save: anything you define through model.on_save_checkpoint
-                LightningDataModule.__class__.__name__: pl DataModule's state
-            }
-        """
-
-        # dump epoch/global_step/pytorch-lightning_version
-        current_epoch = self.trainer.current_epoch
-        global_step = self.trainer.global_step
-        has_reached_max_steps = self.trainer.max_steps and self.trainer.max_steps <= global_step
-
-        global_step += 1
-        if not has_reached_max_steps:
-            current_epoch += 1
-
-        model = self.trainer.lightning_module
-
-        checkpoint = {
-            'epoch': current_epoch,
-            'global_step': global_step,
-            'pytorch-lightning_version': pytorch_lightning.__version__,
-            'state_dict': model.state_dict(),
-        }
-
-        if not weights_only:
-            # dump callbacks
-            checkpoint['callbacks'] = self.trainer.on_save_checkpoint(checkpoint)
-
-            optimizer_states = []
-            for i, optimizer in enumerate(self.trainer.optimizers):
-                # Rely on accelerator to dump optimizer state
-                optimizer_state = self.trainer.accelerator.optimizer_state(optimizer)
-                optimizer_states.append(optimizer_state)
-
-            checkpoint['optimizer_states'] = optimizer_states
-
-            # dump lr schedulers
-            lr_schedulers = []
-            for scheduler in self.trainer.lr_schedulers:
-                lr_schedulers.append(scheduler['scheduler'].state_dict())
-            checkpoint['lr_schedulers'] = lr_schedulers
-
-            # dump amp scaling
-            if (
-                self.trainer.amp_backend == AMPType.NATIVE and self.trainer._device_type != DeviceType.TPU
-                and self.trainer.scaler is not None
-            ):
-                checkpoint['native_amp_scaling_state'] = self.trainer.scaler.state_dict()
-            elif self.trainer.amp_backend == AMPType.APEX:
-                checkpoint['amp_scaling_state'] = amp.state_dict()
-
-        # dump hyper-parameters
-        if model.hparams:
-            if hasattr(model, '_hparams_name'):
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name
-            # dump arguments
-            if _OMEGACONF_AVAILABLE and isinstance(model.hparams, Container):
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_TYPE] = type(model.hparams)
-            else:
-                checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams)
-
-        # give the model a chance to dump a few things
-        model.on_save_checkpoint(checkpoint)
-        if self.trainer.datamodule is not None:
-            self.trainer.datamodule.on_save_checkpoint(checkpoint)
-
-        return checkpoint
-
     def hpc_load(self, checkpoint_path: str, on_gpu: bool):
         """
         Load model/training states from a 'PyTorch-Lightning checkpoint' file for hpc.
@@ -394,19 +306,4 @@ def save_checkpoint(self, filepath, weights_only: bool = False):
             weights_only: saving model weights only
         """
         # dump states as a checkpoint dictionary object
-        checkpoint = self.dump_checkpoint(weights_only)
-        if self.trainer.is_global_zero:
-            # write the checkpoint dictionary on the file
-
-            if self.trainer.training_type_plugin:
-                checkpoint = self.trainer.training_type_plugin.on_save(checkpoint)
-            try:
-                atomic_save(checkpoint, filepath)
-            except AttributeError as err:
-                if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint:
-                    del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY]
-                rank_zero_warn(
-                    'Warning, `hyper_parameters` dropped from checkpoint.'
-                    f' An attribute is not picklable {err}'
-                )
-                atomic_save(checkpoint, filepath)
+        self.trainer.accelerator.save_checkpoint(self.trainer, filepath, weights_only)
\ No newline at end of file
diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py
index e94934020107d..febaefd24cf2e 100644
--- a/pytorch_lightning/utilities/cloud_io.py
+++ b/pytorch_lightning/utilities/cloud_io.py
@@ -19,6 +19,19 @@
 
 import fsspec
 import torch
+import pytorch_lightning as pl
+from pytorch_lightning.utilities import (
+    _APEX_AVAILABLE,
+    _OMEGACONF_AVAILABLE,
+    AMPType,
+    DeviceType,
+)
+
+if _APEX_AVAILABLE:
+    from apex import amp
+
+if _OMEGACONF_AVAILABLE:
+    from omegaconf import Container
 
 
 def load(path_or_url: Union[str, IO, Path], map_location=None):
@@ -63,3 +76,93 @@ def atomic_save(checkpoint, filepath: str):
         torch.save(checkpoint, bytesbuffer)
     with fsspec.open(filepath, "wb") as f:
         f.write(bytesbuffer.getvalue())
+
+
+def dump_checkpoint(trainer: 'pl.Trainer', weights_only: bool = False) -> dict:
+    """Creating a model checkpoint dictionary object from various component states.
+
+    Args:
+        weights_only: saving model weights only
+
+    Return:
+        structured dictionary: {
+            'epoch':                     training epoch
+            'global_step':               training global step
+            'pytorch-lightning_version': PyTorch Lightning's version
+            'callbacks':                 "callback specific state"[] # if not weights_only
+            'optimizer_states':          "PT optim's state_dict"[]   # if not weights_only
+            'lr_schedulers':             "PT sched's state_dict"[]   # if not weights_only
+            'native_amp_scaling_state':  PT amp's state_dict         # if not weights_only and use native amp
+            'amp_scaling_state':         Apex's state_dict           # if not weights_only and use apex amp
+            'state_dict':                Model's state_dict (e.g. network weights)
+            CHECKPOINT_HYPER_PARAMS_NAME:
+            CHECKPOINT_HYPER_PARAMS_KEY:
+            CHECKPOINT_HYPER_PARAMS_TYPE:
+            something_cool_i_want_to_save: anything you define through model.on_save_checkpoint
+            LightningDataModule.__class__.__name__: pl DataModule's state
+        }
+    """
+    from pytorch_lightning import LightningModule
+
+    # dump epoch/global_step/pytorch-lightning_version
+    current_epoch = trainer.current_epoch
+    global_step = trainer.global_step
+    has_reached_max_steps = trainer.max_steps and trainer.max_steps <= global_step
+
+    global_step += 1
+    if not has_reached_max_steps:
+        current_epoch += 1
+
+    model = trainer.lightning_module
+
+    checkpoint = {
+        'epoch': current_epoch,
+        'global_step': global_step,
+        'pytorch-lightning_version': pl.__version__,
+        'state_dict': model.state_dict(),
+    }
+
+    if not weights_only:
+        # dump callbacks
+        checkpoint['callbacks'] = trainer.on_save_checkpoint(checkpoint)
+
+        optimizer_states = []
+        for i, optimizer in enumerate(trainer.optimizers):
+            # Rely on accelerator to dump optimizer state
+            optimizer_state = trainer.accelerator.optimizer_state(optimizer)
+            optimizer_states.append(optimizer_state)
+
+        checkpoint['optimizer_states'] = optimizer_states
+
+        # dump lr schedulers
+        lr_schedulers = []
+        for scheduler in trainer.lr_schedulers:
+            lr_schedulers.append(scheduler['scheduler'].state_dict())
+        checkpoint['lr_schedulers'] = lr_schedulers
+
+        # dump amp scaling
+        if (
+            trainer.amp_backend == AMPType.NATIVE and trainer._device_type != DeviceType.TPU
+            and trainer.scaler is not None
+        ):
+            checkpoint['native_amp_scaling_state'] = trainer.scaler.state_dict()
+        elif trainer.amp_backend == AMPType.APEX:
+            checkpoint['amp_scaling_state'] = amp.state_dict()
+
+    # dump hyper-parameters
+    if model.hparams:
+        if hasattr(model, '_hparams_name'):
+            checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name
+        # dump arguments
+        if _OMEGACONF_AVAILABLE and isinstance(model.hparams, Container):
+            checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams
+            checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_TYPE] = type(model.hparams)
+        else:
+            checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams)
+
+    # give the model a chance to dump a few things
+    model.on_save_checkpoint(checkpoint)
+    if trainer.datamodule is not None:
+        trainer.datamodule.on_save_checkpoint(checkpoint)
+
+    return checkpoint

From 90d6e03ce13a4845f70ce728576bafa6434cb2cf Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Fri, 26 Mar 2021 10:28:17 +0000
Subject: [PATCH 26/60] resolve flake8

---
 pytorch_lightning/accelerators/accelerator.py |  8 +++-----
 .../plugins/training_type/deepspeed.py        | 11 +++++------
 .../plugins/training_type/tpu_spawn.py        |  5 ++---
 .../training_type/training_type_plugin.py     | 13 +++++++------
 .../connectors/checkpoint_connector.py        | 19 ++++---------------
 pytorch_lightning/trainer/trainer.py          |  1 -
 pytorch_lightning/utilities/imports.py        |  2 ++
 7 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 4b5b6ede2f10b..a7e6f20652576 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
-from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union, Generator
+from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union
 
 import torch
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
+
 import pytorch_lightning as pl
 from pytorch_lightning.core import LightningModule
 from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin
@@ -480,7 +481,7 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None:
         )
         self.setup_precision_plugin(plugin)
 
-    def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None:
+    def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
 
         Args:
@@ -488,6 +489,3 @@ def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: boo
             weights_only: saving model weights only
         """
         self.training_type_plugin.save_checkpoint(trainer, filepath, weights_only)
-
-
-
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 89282aac20e39..c67c0c7e6096f 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
-from collections import OrderedDict
 import json
 import logging
 import os
+from collections import OrderedDict
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
 
 import torch
 
+import pytorch_lightning as pl
 from pytorch_lightning.callbacks import GradientAccumulationScheduler
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import _LightningModuleWrapperBase
@@ -30,15 +31,13 @@
 from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
 from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.utilities.apply_func import apply_to_collection
+from pytorch_lightning.utilities.cloud_io import dump_checkpoint
 from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE
-from pytorch_lightning.utilities.cloud_io import dump_checkpoint
-
 
 if _DEEPSPEED_AVAILABLE:
     import deepspeed
-    # from deepspeed.runtime.zero.stage3 import remove_module_hooks
 
 
 def remove_module_hooks(model: torch.nn.Module) -> None:
@@ -298,7 +297,7 @@ def model_parallel_context(self) -> Generator:
             model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True)
         else:
             model_parallel_context = super().model_parallel_context()
-        
+
         with model_parallel_context:
             yield
 
@@ -456,7 +455,7 @@ def _filepath_to_dir(self, filepath: str):
     def deepspeed_engine(self):
         return self.model
 
-    def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None:
+    def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
 
         Args:
diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index d8d2267547877..4f9f3427c5d7c 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -18,18 +18,17 @@
 
 import torch
 import torch.multiprocessing as mp
+
 import pytorch_lightning as pl
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin
 from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle
 from pytorch_lightning.trainer.states import TrainerState
 from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities.cloud_io import dump_checkpoint
 from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import seed_everything
-from pytorch_lightning.utilities.cloud_io import dump_checkpoint
-
-
 
 if _TPU_AVAILABLE:
     import torch_xla.core.xla_model as xm
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index e871e9135b320..0bcc240f14fad 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -13,19 +13,20 @@
 # limitations under the License.
 import contextlib
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union, Generator
-from pytorch_lightning.utilities.cloud_io import atomic_save
-from pytorch_lightning.utilities import rank_zero_warn
+from typing import Any, Callable, Dict, Generator, Iterable, Optional, Tuple, TYPE_CHECKING, Union
 
 import torch
 from torch.nn import Module
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 
+import pytorch_lightning as pl
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import unwrap_lightning_module
 from pytorch_lightning.plugins.base_plugin import Plugin
-from pytorch_lightning.utilities.cloud_io import load as pl_load, dump_checkpoint
+from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities.cloud_io import atomic_save, dump_checkpoint
+from pytorch_lightning.utilities.cloud_io import load as pl_load
 
 if TYPE_CHECKING:
     from pytorch_lightning.trainer.trainer import Trainer
@@ -224,7 +225,7 @@ def model_parallel_context(self) -> Generator:
         """
         yield
 
-    def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None:
+    def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
 
         Args:
@@ -246,4 +247,4 @@ def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: boo
                     'Warning, `hyper_parameters` dropped from checkpoint.'
                     f' An attribute is not picklable {err}'
                 )
-                atomic_save(checkpoint, filepath)
\ No newline at end of file
+                atomic_save(checkpoint, filepath)
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index d79863a86fc77..286585c168782 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -19,17 +19,9 @@
 
 import torch
 
-import pytorch_lightning as pl
 from pytorch_lightning.core.lightning import LightningModule
-from pytorch_lightning.utilities import (
-    _APEX_AVAILABLE,
-    _OMEGACONF_AVAILABLE,
-    AMPType,
-    DeviceType,
-    rank_zero_info,
-    rank_zero_warn,
-)
-from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
+from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType, DeviceType, rank_zero_info, rank_zero_warn
+from pytorch_lightning.utilities.cloud_io import atomic_save, dump_checkpoint, get_filesystem
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS
@@ -37,9 +29,6 @@
 if _APEX_AVAILABLE:
     from apex import amp
 
-if _OMEGACONF_AVAILABLE:
-    from omegaconf import Container
-
 
 class CheckpointConnector:
 
@@ -215,7 +204,7 @@ def hpc_save(self, folderpath: str, logger):
 
         # give model a chance to do something on hpc_save
         model = self.trainer.lightning_module
-        checkpoint = self.dump_checkpoint()
+        checkpoint = dump_checkpoint(self.trainer)
 
         model.on_hpc_save(checkpoint)
 
@@ -306,4 +295,4 @@ def save_checkpoint(self, filepath, weights_only: bool = False):
             weights_only: saving model weights only
         """
         # dump states as a checkpoint dictionary object
-        self.trainer.accelerator.save_checkpoint(self.trainer, filepath, weights_only)
\ No newline at end of file
+        self.trainer.accelerator.save_checkpoint(self.trainer, filepath, weights_only)
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 95b01aba1ecec..be584637d40d7 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -57,7 +57,6 @@
 from pytorch_lightning.trainer.training_loop import TrainLoop
 from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin
 from pytorch_lightning.tuner.tuning import Tuner
-from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities import DeviceType, rank_zero_warn
 from pytorch_lightning.utilities.debugging import InternalDebugger
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index 87a503e5106a8..bf940e693d5e0 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -62,6 +62,7 @@ def _compare_version(package: str, op, version) -> bool:
         return True
     return op(pkg_version, LooseVersion(version))
 
+
 def _is_kineto_available() -> bool:
     _KINETO_AVAILABLE = False
     if _TORCH_GREATER_EQUAL_1_8:
@@ -71,6 +72,7 @@ def _is_kineto_available() -> bool:
             _KINETO_AVAILABLE = kineto_available_fx()
     return _KINETO_AVAILABLE
 
+
 _IS_WINDOWS = platform.system() == "Windows"
 _IS_INTERACTIVE = hasattr(sys, "ps1")  # https://stackoverflow.com/a/64523765
 _TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0")

From b6361b8e0fea65b89bf3903837d899e76bd9876d Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Fri, 26 Mar 2021 15:02:30 +0000
Subject: [PATCH 27/60] add BC

---
 pytorch_lightning/trainer/connectors/checkpoint_connector.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index d79863a86fc77..baad964c3797d 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -29,7 +29,7 @@
     rank_zero_info,
     rank_zero_warn,
 )
-from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
+from pytorch_lightning.utilities.cloud_io import atomic_save, dump_checkpoint, get_filesystem
 from pytorch_lightning.utilities.cloud_io import load as pl_load
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS
@@ -298,6 +298,9 @@ def get_max_ckpt_path_from_folder(self, folder_path: Union[str, Path]) -> str:
         ckpt_number = max_suffix if max_suffix is not None else 0
         return f'{folder_path}/hpc_ckpt_{ckpt_number}.ckpt'
 
+    def dump_checkpoint(self, weights_only: bool = False) -> dict:
+        return dump_checkpoint(self.trainer, weights_only)
+
     def save_checkpoint(self, filepath, weights_only: bool = False):
         """Save model/training states as a checkpoint file through state-dump and file-write.
 

From 6acaccb395a578f8f60caab45d059d47ad5d110e Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Mon, 29 Mar 2021 10:49:02 +0100
Subject: [PATCH 28/60] Change recommended scale to 16

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index c67c0c7e6096f..7f85716f65ac8 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -95,7 +95,7 @@ def __init__(
         parallel_devices: Optional[List[torch.device]] = None,
         cluster_environment: Optional[ClusterEnvironment] = None,
         loss_scale: float = 0,
-        initial_scale_power: int = 32,
+        initial_scale_power: int = 16,
         loss_scale_window: int = 1000,
         hysteresis: int = 2,
         min_loss_scale: int = 1,

From 68b8a43a6f959244e21cc25f21f448bec1f5a2af Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 09:05:45 +0100
Subject: [PATCH 29/60] resolve flake8

---
 pytorch_lightning/accelerators/accelerator.py                | 1 -
 pytorch_lightning/plugins/training_type/deepspeed.py         | 1 -
 pytorch_lightning/plugins/training_type/tpu_spawn.py         | 1 -
 .../plugins/training_type/training_type_plugin.py            | 1 -
 pytorch_lightning/trainer/trainer.py                         | 5 ++++-
 pytorch_lightning/utilities/cloud_io.py                      | 1 -
 6 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index e947f7831efba..7d16d91e3bf82 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -18,7 +18,6 @@
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 
-import pytorch_lightning as pl
 from pytorch_lightning.core import LightningModule
 from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin
 from pytorch_lightning.plugins.training_type import TrainingTypePlugin
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 249a11ddb7ff7..59eff80b8c358 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -22,7 +22,6 @@
 
 import torch
 
-import pytorch_lightning as pl
 from pytorch_lightning.callbacks import GradientAccumulationScheduler
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import _LightningModuleWrapperBase
diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index 659937bbdfa48..ba074e7cfb206 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -19,7 +19,6 @@
 import torch
 import torch.multiprocessing as mp
 
-import pytorch_lightning as pl
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin
 from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 9023e29852084..44de046b57108 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -20,7 +20,6 @@
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader
 
-import pytorch_lightning as pl
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.overrides.base import unwrap_lightning_module
 from pytorch_lightning.plugins.base_plugin import Plugin
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d2582f76d0633..d565f0906e59e 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1086,7 +1086,10 @@ def call_setup_hook(self, model: LightningModule) -> None:
     def call_configure_sharded_model(self, model: LightningModule) -> None:
         # Call configure sharded model hook if accelerator requests. In some cases
         # we will not call the hook; the hook has initialized the sharded model for example.
-        if self.accelerator.call_configure_sharded_model_hook and not getattr(model, "call_configure_sharded_model_hook", False):
+
+        # used on the model if the user re-create a trainer with resume_from_checkpoint
+        model_call_configure_sharded_model_hook = getattr(model, "call_configure_sharded_model_hook", False)
+        if self.accelerator.call_configure_sharded_model_hook and not model_call_configure_sharded_model_hook:
             with self.accelerator.model_sharded_context():
                 model.configure_sharded_model()
                 self.configure_sharded_model(model)
diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py
index 82e02cba42e27..e94934020107d 100644
--- a/pytorch_lightning/utilities/cloud_io.py
+++ b/pytorch_lightning/utilities/cloud_io.py
@@ -63,4 +63,3 @@ def atomic_save(checkpoint, filepath: str):
         torch.save(checkpoint, bytesbuffer)
     with fsspec.open(filepath, "wb") as f:
         f.write(bytesbuffer.getvalue())
-

From a7dcb7b88137982552f82010949b6b9f482c0d56 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 08:23:32 +0000
Subject: [PATCH 30/60] update test

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ee93ca59eca76..9c3ee6ceeef5b 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1475,7 +1475,7 @@ def test_trainer_predict_dp(tmpdir, num_gpus):
     predict(tmpdir, "dp", num_gpus, None)
 
 
-@RunIf(min_gpus=2, special=True)
+@RunIf(min_gpus=2, special=True, fairscale=True)
 def test_trainer_predict_ddp(tmpdir):
     predict(tmpdir, "ddp", 2, None, plugins=["ddp_sharded"])
 

From 6b08478ca9b7ead0f4cf899c9b9ed7f012b4ef47 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 08:26:39 +0000
Subject: [PATCH 31/60] update install

---
 requirements/extra.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements/extra.txt b/requirements/extra.txt
index 715916c4e36ac..cf18020b5714b 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -9,3 +9,4 @@ onnxruntime>=1.3.0
 hydra-core>=1.0
 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs
 https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
+git+https://github.com/microsoft/DeepSpeed.git

From 45a49c5b1f28ea7d24a440648c0080b208bd2047 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 08:50:23 +0000
Subject: [PATCH 32/60] update

---
 .github/workflows/ci_test-full.yml | 4 ++++
 requirements/extra.txt             | 3 +--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
index ec9e71c5b83b2..24e1561540fe4 100644
--- a/.github/workflows/ci_test-full.yml
+++ b/.github/workflows/ci_test-full.yml
@@ -117,6 +117,7 @@ jobs:
         # pip uninstall -y horovod
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)"
 
+
     - name: Install dependencies
       env:
         # MAKEFLAGS: "-j2"
@@ -132,6 +133,9 @@ jobs:
         python ./requirements/adjust_versions.py requirements/extra.txt
         python ./requirements/adjust_versions.py requirements/examples.txt
         pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
+        
+        # install latest version of DeepSpeed - temporary solution until next release
+        pip install git+https://github.com/microsoft/DeepSpeed.git
         pip list
       shell: bash
 
diff --git a/requirements/extra.txt b/requirements/extra.txt
index cf18020b5714b..d9fc96bce1eeb 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -8,5 +8,4 @@ torchtext>=0.5
 onnxruntime>=1.3.0
 hydra-core>=1.0
 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs
-https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
-git+https://github.com/microsoft/DeepSpeed.git
+https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
\ No newline at end of file

From a8da29954fda963fc290ac77970ad22c9ab83a7e Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 09:13:52 +0000
Subject: [PATCH 33/60] update test

---
 azure-pipelines.yml                    |  2 ++
 tests/plugins/test_deepspeed_plugin.py | 14 +++++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 85664bac74b67..2491f21fee285 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -63,6 +63,8 @@ jobs:
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
         pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir
+        # install latest version of DeepSpeed - temporary solution until next release
+        pip install git+https://github.com/microsoft/DeepSpeed.git
         pip list
       displayName: 'Install dependencies'
 
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 95192d44a008f..a515cbbfb813f 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,6 +1,7 @@
 import json
 import os
 from typing import Any
+from unittest.mock import call
 
 import pytest
 import torch
@@ -9,6 +10,7 @@
 from torch.optim import Optimizer
 
 from pytorch_lightning import LightningModule, seed_everything, Trainer
+from pytorch_lightning import callbacks
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.metrics import Accuracy
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
@@ -324,24 +326,26 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir):
     assert checkpoint_config['synchronize_checkpoint_boundary']
 
 
-@RunIf(min_gpus=1, deepspeed=True)
+#@RunIf(min_gpus=1, deepspeed=True)
 def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
     """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config."""
 
     deepspeed_zero_config['zero_optimization']['cpu_offload'] = False
 
-    class TestModel(BoringModel):
+    class TestCallback(Callback):
 
-        def on_train_start(self) -> None:
-            assert self.trainer.training_type_plugin.config['zero_optimization']['cpu_offload'] is False
+        def on_before_accelerator_backend_setup(self, trainer, pl_module) -> None:
+            assert trainer.training_type_plugin.config['zero_optimization']['cpu_offload'] is False
             raise SystemExit()
 
-    model = TestModel()
+    model = BoringModel()
     trainer = Trainer(
+        max_epochs=1,
         plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
         precision=16,
         gpus=1,
         default_root_dir=tmpdir,
+        callbacks=[TestCallback()]
     )
     with pytest.raises(SystemExit):
         trainer.fit(model)

From 99f1d960aa46f8945fd076c0e59bf45641e56e4d Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 10:15:33 +0100
Subject: [PATCH 34/60] update

---
 tests/plugins/test_deepspeed_plugin.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index a515cbbfb813f..d3a7784600690 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,7 +1,6 @@
 import json
 import os
 from typing import Any
-from unittest.mock import call
 
 import pytest
 import torch
@@ -10,7 +9,6 @@
 from torch.optim import Optimizer
 
 from pytorch_lightning import LightningModule, seed_everything, Trainer
-from pytorch_lightning import callbacks
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.metrics import Accuracy
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
@@ -326,7 +324,7 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir):
     assert checkpoint_config['synchronize_checkpoint_boundary']
 
 
-#@RunIf(min_gpus=1, deepspeed=True)
+@RunIf(min_gpus=1, deepspeed=True)
 def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
     """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config."""
 

From 89601d85ed19ffe4f849af101c4f7699040a212e Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 09:17:45 +0000
Subject: [PATCH 35/60] update

---
 .github/workflows/ci_test-full.yml | 3 ---
 azure-pipelines.yml                | 1 -
 requirements/extra.txt             | 3 ++-
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
index 24e1561540fe4..0af812a0a172f 100644
--- a/.github/workflows/ci_test-full.yml
+++ b/.github/workflows/ci_test-full.yml
@@ -133,9 +133,6 @@ jobs:
         python ./requirements/adjust_versions.py requirements/extra.txt
         python ./requirements/adjust_versions.py requirements/examples.txt
         pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
-        
-        # install latest version of DeepSpeed - temporary solution until next release
-        pip install git+https://github.com/microsoft/DeepSpeed.git
         pip list
       shell: bash
 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 2491f21fee285..43953cc6ffaf6 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -64,7 +64,6 @@ jobs:
         pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
         pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir
         # install latest version of DeepSpeed - temporary solution until next release
-        pip install git+https://github.com/microsoft/DeepSpeed.git
         pip list
       displayName: 'Install dependencies'
 
diff --git a/requirements/extra.txt b/requirements/extra.txt
index d9fc96bce1eeb..1175466a3df5e 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -8,4 +8,5 @@ torchtext>=0.5
 onnxruntime>=1.3.0
 hydra-core>=1.0
 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs
-https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
\ No newline at end of file
+https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
+deepspeed
\ No newline at end of file

From 389c60b9705d57218e7355ca33fcf3f8422166f3 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 11:26:06 +0000
Subject: [PATCH 36/60] update test

---
 pytorch_lightning/accelerators/accelerator.py |  3 +
 .../plugins/training_type/deepspeed.py        | 24 +++---
 .../training_type/training_type_plugin.py     | 18 ++---
 pytorch_lightning/trainer/training_loop.py    |  3 +-
 pytorch_lightning/utilities/imports.py        | 12 +--
 tests/plugins/test_deepspeed_plugin.py        | 77 ++++++++++---------
 tests/special_tests.sh                        | 18 ++++-
 7 files changed, 80 insertions(+), 75 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 7d16d91e3bf82..37725e1c77f4b 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -511,3 +511,6 @@ def setup_optimizers_in_pre_dispatch(self) -> bool:
         Returns: If True, delay setup optimizers till pre_dispatch, else call within setup.
         """
         return self.training_type_plugin.setup_optimizers_in_pre_dispatch
+
+    def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
+        return self.training_type_plugin.compute_new_global_step(total_batch_idx, current_global_step)
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 59eff80b8c358..a3fde3e448729 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -290,11 +290,11 @@ def _initialize_deepspeed_train(self, model):
         self.model = model
 
     @contextlib.contextmanager
-    def model_parallel_context(self) -> Generator:
+    def model_sharded_context(self) -> Generator:
         if self.zero_stage_3:
             model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True)
         else:
-            model_parallel_context = super().model_parallel_context()
+            model_parallel_context = super().model_sharded_context()
 
         with model_parallel_context:
             yield
@@ -333,7 +333,6 @@ def _initialize_deepspeed_inference(self, model):
         # Remove all module hooks before initializing new model
         remove_module_hooks(model)
         model, _, _, _ = deepspeed.initialize(
-            args=SimpleNamespace(local_rank=self.local_rank),
             model=model,
             optimizer=optimizer,
             lr_scheduler=lightning_scheduler,
@@ -460,7 +459,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None:
             filepath: write-target file's path
             weights_only: saving model weights only
         """
-        if torch.distributed.get_world_size() > 1:
+        if torch.distributed.get_world_size() > 1 and self.zero_stage_3:
             # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
             # dump states as a checkpoint dictionary object
             save_dir = self._filepath_to_dir(filepath)
@@ -476,7 +475,6 @@ def restore_model_state_from_ckpt_path(self,
                                            map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
         if torch.distributed.get_world_size() > 1:
             from pytorch_lightning.trainer.states import TrainerState
-            print("restore_model_state_from_ckpt_path")
             stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING
             save_dir = self._filepath_to_dir(ckpt_path)
 
@@ -497,14 +495,14 @@ def restore_model_state_from_ckpt_path(self,
             return client_state, False
         return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location)
 
-    def _accumulated_batches_reached(self, trainer):
-        return trainer.total_batch_idx % trainer.accumulate_grad_batches == 0
+    def _accumulated_batches_reached(self, total_batch_idx: int) -> bool:
+        return total_batch_idx % self._original_accumulate_grad_batches == 0
 
-    def increment_accumulated_grad_global_step(self, trainer):
+    def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
         if self._original_accumulate_grad_batches is None:
-            trainer.global_step += 1
+            return current_global_step + 1
         else:
-            trainer.accumulate_grad_batches = self._original_accumulate_grad_batches
-            if self._accumulated_batches_reached(trainer):
-                trainer.global_step += 1
-            trainer.accumulate_grad_batches = 1
+            if self._accumulated_batches_reached(total_batch_idx, ):
+                current_global_step += 1
+            return current_global_step
+
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 44de046b57108..7c686ef140a05 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -223,19 +223,17 @@ def restore_model_state_from_ckpt_path(self,
         self.lightning_module.load_state_dict(ckpt['state_dict'])
         return ckpt, True
 
-    def increment_accumulated_grad_global_step(self, trainer) -> None:
-        trainer.global_step += 1
-
-    @contextlib.contextmanager
-    def model_parallel_context(self) -> Generator:
+    def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
         """
-        Provide hook to create modules in a parallel aware context. This is useful for when we'd like to
-        shard the model instantly, which is useful for extremely large models which can save memory and
-        initialization time.
+        Provide a hook to count optimizer step calls.
 
-        Returns: Model parallel context.
+        Args:
+            total_batch_idx: Total number of batches seen for training
+            current_global_step: Current number of optimizer step calls 
+
+        Returns: New optimizer step calls
         """
-        yield
+        return current_global_step + 1
 
     def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: str) -> None:
         """Save model/training states as a checkpoint file through state-dump and file-write.
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 9d02b64f2b386..5f69d1a4828e1 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -771,7 +771,8 @@ def increment_accumulated_grad_global_step(self):
 
         # progress global step according to grads progress
         if num_accumulated_batches_reached or num_training_batches_reached:
-            self.trainer.training_type_plugin.increment_accumulated_grad_global_step(self.trainer)
+            self.trainer.global_step = self.trainer.accelerator.compute_new_global_step(
+                self.trainer.total_batch_idx, self.trainer.global_step)
 
     def _accumulated_batches_reached(self):
         return (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index bf940e693d5e0..5c4de60263aef 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -63,23 +63,13 @@ def _compare_version(package: str, op, version) -> bool:
     return op(pkg_version, LooseVersion(version))
 
 
-def _is_kineto_available() -> bool:
-    _KINETO_AVAILABLE = False
-    if _TORCH_GREATER_EQUAL_1_8:
-        # kineto isn't available into pre 1.8 release
-        kineto_available_fx = getattr(torch.profiler, "kineto_available", None)
-        if kineto_available_fx:
-            _KINETO_AVAILABLE = kineto_available_fx()
-    return _KINETO_AVAILABLE
-
-
 _IS_WINDOWS = platform.system() == "Windows"
 _IS_INTERACTIVE = hasattr(sys, "ps1")  # https://stackoverflow.com/a/64523765
 _TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0")
 _TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0")
 _TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0")
 _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0")
-_KINETO_AVAILABLE = _is_kineto_available()
+_KINETO_AVAILABLE = torch.profiler.kineto_available() if _TORCH_GREATER_EQUAL_1_8 else False
 _APEX_AVAILABLE = _module_available("apex.amp")
 _BOLTS_AVAILABLE = _module_available('pl_bolts')
 _DEEPSPEED_AVAILABLE = not _IS_WINDOWS and _module_available('deepspeed')
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index d3a7784600690..5833120e1a40c 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,6 +1,7 @@
 import json
 import os
 from typing import Any
+from unittest.mock import call
 
 import pytest
 import torch
@@ -9,6 +10,7 @@
 from torch.optim import Optimizer
 
 from pytorch_lightning import LightningModule, seed_everything, Trainer
+from pytorch_lightning import callbacks
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.metrics import Accuracy
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
@@ -19,6 +21,16 @@
 from tests.helpers.runif import RunIf
 
 
+class ModelParallelBoringModel(BoringModel):
+
+    def __init__(self):
+        super().__init__()
+        self.linear = None
+
+    def configure_sharded_model(self) -> None:
+        self.linear = torch.nn.Linear(32, 2)
+
+
 def test_deepspeed_lightning_module(tmpdir):
     """
     Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.
@@ -185,7 +197,7 @@ def test_deepspeed_defaults(tmpdir):
     assert isinstance(plugin.config["zero_optimization"], dict)
 
 
-@RunIf(min_gpus=1, deepspeed=True, special=True)
+@RunIf(min_gpus=1, deepspeed=True)
 def test_invalid_deepspeed_defaults_no_precision(tmpdir):
     """Test to ensure that using defaults, if precision is not set to 16, we throw an exception."""
     model = BoringModel()
@@ -228,24 +240,25 @@ def test_deepspeed_run_configure_optimizers(tmpdir):
     whilst using configure_optimizers for optimizers and schedulers.
     """
 
-    class TestModel(BoringModel):
+    class TestCB(Callback):
 
-        def on_train_start(self) -> None:
+        def on_train_start(self, trainer, pl_module) -> None:
             from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
 
-            assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
-            assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD)
-            assert self.trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
+            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
+            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
+            assert trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
             # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler
-            assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
+            assert isinstance(trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR)
 
-    model = TestModel()
+    model = BoringModel()
     trainer = Trainer(
         plugins=DeepSpeedPlugin(),  # disable ZeRO so our optimizers are not wrapped
         default_root_dir=tmpdir,
         gpus=1,
         fast_dev_run=True,
         precision=16,
+        callbacks=[TestCB()]
     )
 
     trainer.fit(model)
@@ -260,25 +273,26 @@ def test_deepspeed_config(tmpdir, deepspeed_zero_config):
     and saves the model weights to load correctly.
     """
 
-    class TestModel(BoringModel):
+    class TestCB(Callback):
 
-        def on_train_start(self) -> None:
+        def on_train_start(self, trainer, pl_module) -> None:
             from deepspeed.runtime.lr_schedules import WarmupLR
             from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer
 
-            assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
-            assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD)
-            assert self.trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
+            assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer)
+            assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD)
+            assert trainer.lr_schedulers == []  # DeepSpeed manages LR scheduler internally
             # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler
-            assert isinstance(self.trainer.model.lr_scheduler, WarmupLR)
+            assert isinstance(trainer.model.lr_scheduler, WarmupLR)
 
-    model = TestModel()
+    model = BoringModel()
     trainer = Trainer(
         plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
         default_root_dir=tmpdir,
         gpus=1,
         fast_dev_run=True,
         precision=16,
+        callbacks=[TestCB()]
     )
 
     trainer.fit(model)
@@ -291,19 +305,19 @@ def on_train_start(self) -> None:
 def test_deepspeed_custom_precision_params(tmpdir):
     """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes."""
 
-    class TestModel(BoringModel):
+    class TestCB(Callback):
 
-        def on_train_start(self) -> None:
-            assert self.trainer.training_type_plugin.config['fp16']['loss_scale'] == 10
-            assert self.trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10
-            assert self.trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10
-            assert self.trainer.training_type_plugin.config['fp16']['hysteresis'] == 10
-            assert self.trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10
+        def on_train_start(self, trainer, pl_module) -> None:
+            assert trainer.training_type_plugin.config['fp16']['loss_scale'] == 10
+            assert trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10
+            assert trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10
+            assert trainer.training_type_plugin.config['fp16']['hysteresis'] == 10
+            assert trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10
             raise SystemExit()
 
-    model = TestModel()
+    model = BoringModel()
     ds = DeepSpeedPlugin(loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10)
-    trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, gpus=1)
+    trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, amp_backend='native', gpus=1, callbacks=[TestCB()])
     with pytest.raises(SystemExit):
         trainer.fit(model)
 
@@ -356,7 +370,7 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config):
     """
     model = BoringModel()
     trainer = Trainer(
-        plugins=[DeepSpeedPlugin()],
+        plugins=[DeepSpeedPlugin(zero_optimization=False, stage=2)],
         default_root_dir=tmpdir,
         gpus=2,
         fast_dev_run=True,
@@ -368,16 +382,6 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config):
     _assert_save_model_is_equal(model, tmpdir, trainer)
 
 
-class ModelParallelBoringModel(BoringModel):
-
-    def __init__(self):
-        super().__init__()
-        self.linear = None
-
-    def configure_sharded_model(self) -> None:
-        self.linear = torch.nn.Linear(32, 2)
-
-
 class ModelParallelClassificationModel(LightningModule):
 
     def __init__(self, lr: float = 0.01, num_blocks: int = 5):
@@ -454,7 +458,8 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
     trainer.fit(model)
     trainer.test(model)
 
-    _assert_save_model_is_equal(model, tmpdir, trainer)
+    # todo (tchaton) Currently load_from_checkpoint is not supported for zero-v3
+    # _assert_save_model_is_equal(model, tmpdir, trainer)
 
 
 @RunIf(min_gpus=2, deepspeed=True, special=True)
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index aa5d65844a1c5..cc35cca97b114 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -32,6 +32,9 @@ linenos_arr=($linenos)
 blocklist='test_pytorch_profiler_nested_emit_nvtx'
 report=''
 
+# Replace the debugging token with any string to run only matching tests. Reset it to an empty string before committing.
+DEBUGGING_TOKEN=""
+
 for i in "${!files_arr[@]}"; do
   file=${files_arr[$i]}
   lineno=${linenos_arr[$i]}
@@ -52,16 +55,23 @@ for i in "${!files_arr[@]}"; do
         break
       fi
 
-      # run the test
-      report+="Ran\t$file:$lineno::$test_name\n"
-      python ${defaults} "${file}::${test_name}"
-      break
+      if [[ $line == *$DEBUGGING_TOKEN* ]]; then
+        # run the test
+        report+="Ran\t$file:$lineno::$test_name\n"
+        python ${defaults} "${file}::${test_name}"
+        break
+      fi
     fi
   done < <(echo "$test_code")
 done
 
 nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
 
+if [[ -n "${DEBUGGING_TOKEN}" ]]; 
+  echo "DEBUGGING_TOKEN: $DEBUGGING_TOKEN should be empty"
+  then exit 1
+fi
+
 # echo test report
 printf '=%.s' {1..80}
 printf "\n$report"

From de5f358a3de6337ff49d533f3b89d6578578a322 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 12:29:16 +0100
Subject: [PATCH 37/60] resolve flake8

---
 pytorch_lightning/plugins/training_type/deepspeed.py          | 2 --
 .../plugins/training_type/training_type_plugin.py             | 2 +-
 tests/plugins/test_deepspeed_plugin.py                        | 4 +---
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index a3fde3e448729..3b1b35d822844 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -17,7 +17,6 @@
 import os
 from collections import OrderedDict
 from pathlib import Path
-from types import SimpleNamespace
 from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
 
 import torch
@@ -505,4 +504,3 @@ def compute_new_global_step(self, total_batch_idx: int, current_global_step: int
             if self._accumulated_batches_reached(total_batch_idx, ):
                 current_global_step += 1
             return current_global_step
-
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 7c686ef140a05..22b8f43c28cd4 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -229,7 +229,7 @@ def compute_new_global_step(self, total_batch_idx: int, current_global_step: int
 
         Args:
             total_batch_idx: Total number of batches seen for training
-            current_global_step: Current number of optimizer step calls 
+            current_global_step: Current number of optimizer step calls
 
         Returns: New optimizer step calls
         """
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 5833120e1a40c..b2637a114a09c 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,7 +1,6 @@
 import json
 import os
 from typing import Any
-from unittest.mock import call
 
 import pytest
 import torch
@@ -10,7 +9,6 @@
 from torch.optim import Optimizer
 
 from pytorch_lightning import LightningModule, seed_everything, Trainer
-from pytorch_lightning import callbacks
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.metrics import Accuracy
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
@@ -317,7 +315,7 @@ def on_train_start(self, trainer, pl_module) -> None:
 
     model = BoringModel()
     ds = DeepSpeedPlugin(loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10)
-    trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, amp_backend='native', gpus=1, callbacks=[TestCB()])
+    trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, gpus=1, callbacks=[TestCB()])
     with pytest.raises(SystemExit):
         trainer.fit(model)
 

From 301b1aaeb6d139f1028abef1d79c36db4cbbd338 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 12:35:04 +0100
Subject: [PATCH 38/60] update

---
 .github/workflows/ci_test-full.yml | 1 -
 azure-pipelines.yml                | 1 -
 requirements/extra.txt             | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
index 0af812a0a172f..ec9e71c5b83b2 100644
--- a/.github/workflows/ci_test-full.yml
+++ b/.github/workflows/ci_test-full.yml
@@ -117,7 +117,6 @@ jobs:
         # pip uninstall -y horovod
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)"
 
-
     - name: Install dependencies
       env:
         # MAKEFLAGS: "-j2"
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 43953cc6ffaf6..85664bac74b67 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -63,7 +63,6 @@ jobs:
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
         pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir
-        # install latest version of DeepSpeed - temporary solution until next release
         pip list
       displayName: 'Install dependencies'
 
diff --git a/requirements/extra.txt b/requirements/extra.txt
index 1175466a3df5e..cd6c5fc94fe6f 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -9,4 +9,4 @@ onnxruntime>=1.3.0
 hydra-core>=1.0
 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs
 https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
-deepspeed
\ No newline at end of file
+deepspeed==0.3.13

From b9542ae0aa85dd1d4e6e728b2689b88b55a5a3c2 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 13:33:16 +0100
Subject: [PATCH 39/60] update

---
 tests/special_tests.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index cc35cca97b114..b82c90c8decfb 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -67,9 +67,10 @@ done
 
 nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
 
-if [[ -n "${DEBUGGING_TOKEN}" ]]; 
+if [[ -n ${DEBUGGING_TOKEN} ]];
+then
   echo "DEBUGGING_TOKEN: $DEBUGGING_TOKEN should be empty"
-  then exit 1
+  exit 1
 fi
 
 # echo test report

From 48c09505ab981cd2b0a484ed03aafd766e7ccccc Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 13:37:41 +0100
Subject: [PATCH 40/60] update on comments

---
 dockers/base-cuda/Dockerfile                          |  2 +-
 pytorch_lightning/accelerators/accelerator.py         |  6 +++---
 pytorch_lightning/plugins/training_type/deepspeed.py  | 11 ++++-------
 .../plugins/training_type/training_type_plugin.py     |  2 +-
 pytorch_lightning/trainer/training_loop.py            |  5 +++--
 requirements/extra.txt                                |  2 +-
 6 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index d7c13e7560010..476ef75319b4b 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -114,7 +114,7 @@ RUN \
     rm -rf apex
 
 RUN \
-    pip install deepspeed==0.3.13
+    pip install "deepspeed>=0.3.13"
 
 RUN \
     # Show what we have
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 37725e1c77f4b..569af875e6c64 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -441,7 +441,7 @@ def results(self) -> Any:
         return self.training_type_plugin.results
 
     @contextlib.contextmanager
-    def model_sharded_context(self) -> Generator:
+    def model_sharded_context(self) -> Generator[None, None, None]:
         """
         Provide hook to create modules in a distributed aware context. This is useful for when we'd like to
         shard the model instantly - useful for extremely large models. Can save memory and
@@ -512,5 +512,5 @@ def setup_optimizers_in_pre_dispatch(self) -> bool:
         """
         return self.training_type_plugin.setup_optimizers_in_pre_dispatch
 
-    def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
-        return self.training_type_plugin.compute_new_global_step(total_batch_idx, current_global_step)
+    def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
+        return self.training_type_plugin.update_global_step(total_batch_idx, current_global_step)
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 3b1b35d822844..80a12fe181745 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -289,7 +289,7 @@ def _initialize_deepspeed_train(self, model):
         self.model = model
 
     @contextlib.contextmanager
-    def model_sharded_context(self) -> Generator:
+    def model_sharded_context(self) -> Generator[None, None, None]:
         if self.zero_stage_3:
             model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True)
         else:
@@ -494,13 +494,10 @@ def restore_model_state_from_ckpt_path(self,
             return client_state, False
         return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location)
 
-    def _accumulated_batches_reached(self, total_batch_idx: int) -> bool:
-        return total_batch_idx % self._original_accumulate_grad_batches == 0
-
-    def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
+    def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
         if self._original_accumulate_grad_batches is None:
-            return current_global_step + 1
+            return super().update_global_step(total_batch_idx, current_global_step)
         else:
-            if self._accumulated_batches_reached(total_batch_idx, ):
+            if total_batch_idx % self._original_accumulate_grad_batches == 0:
                 current_global_step += 1
             return current_global_step
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index 22b8f43c28cd4..d155ce018c77d 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -223,7 +223,7 @@ def restore_model_state_from_ckpt_path(self,
         self.lightning_module.load_state_dict(ckpt['state_dict'])
         return ckpt, True
 
-    def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
+    def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int:
         """
         Provide a hook to count optimizer step calls.
 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 5f69d1a4828e1..4640343710f81 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -771,8 +771,9 @@ def increment_accumulated_grad_global_step(self):
 
         # progress global step according to grads progress
         if num_accumulated_batches_reached or num_training_batches_reached:
-            self.trainer.global_step = self.trainer.accelerator.compute_new_global_step(
-                self.trainer.total_batch_idx, self.trainer.global_step)
+            self.trainer.global_step = self.trainer.accelerator.update_global_step(
+                self.trainer.total_batch_idx, self.trainer.global_step
+            )
 
     def _accumulated_batches_reached(self):
         return (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0
diff --git a/requirements/extra.txt b/requirements/extra.txt
index cd6c5fc94fe6f..cee1fd0eb07e1 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -9,4 +9,4 @@ onnxruntime>=1.3.0
 hydra-core>=1.0
 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs
 https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
-deepspeed==0.3.13
+deepspeed>=0.3.13

From c2304071900669052b347a8be86a49da548866b7 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Tue, 30 Mar 2021 13:42:14 +0100
Subject: [PATCH 41/60] Push

---
 .github/workflows/events-nightly.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index 5ad4396a006f7..91d509f193339 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -3,6 +3,7 @@ name: Nightly events
 # https://jasonet.co/posts/scheduled-actions/
 # https://github.community/t/distinct-job-for-each-schedule/17811/2
 on:
+  push: {} # fixme
   schedule:
     - cron: "0 0 * * *" # At the end of every day
 

From 783265f338ee7d757ac17e451090244537ef1c9e Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Tue, 30 Mar 2021 13:42:34 +0100
Subject: [PATCH 42/60] pull

---
 .github/workflows/events-nightly.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index 91d509f193339..5ad4396a006f7 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -3,7 +3,6 @@ name: Nightly events
 # https://jasonet.co/posts/scheduled-actions/
 # https://github.community/t/distinct-job-for-each-schedule/17811/2
 on:
-  push: {} # fixme
   schedule:
     - cron: "0 0 * * *" # At the end of every day
 

From c8f79f97f43543f26aa9c04eb56d6c8f6db02d29 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 13:56:43 +0100
Subject: [PATCH 43/60] Update
 pytorch_lightning/plugins/training_type/deepspeed.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
---
 pytorch_lightning/plugins/training_type/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 80a12fe181745..06622a41193f7 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -166,7 +166,7 @@ def __init__(
 
             partition_activations: Enables partition activation when used with ZeRO stage 3.
                 Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint.
-                See https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional
+                See `deepspeed tutorial <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_
 
             cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled
 

From 61378debabeee499b4f73f8b370b71252e604846 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 13:56:53 +0100
Subject: [PATCH 44/60] Update
 pytorch_lightning/plugins/training_type/deepspeed.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
---
 pytorch_lightning/plugins/training_type/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 06622a41193f7..566461acdd4fd 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -173,7 +173,7 @@ def __init__(
             contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory.
                 Not supported by all models
 
-            synchronize_checkpoint_boundary: Insert ``torch.cuda.synchronize()`` at each checkpoint boundary.
+            synchronize_checkpoint_boundary: Insert :func:`torch.cuda.synchronize` at each checkpoint boundary.
         """
         if not _DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(

From 45c9569d58cb2c1518d161e76163aee086108885 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 14:01:54 +0100
Subject: [PATCH 45/60] update

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 566461acdd4fd..78ab9af3dc139 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -166,7 +166,8 @@ def __init__(
 
             partition_activations: Enables partition activation when used with ZeRO stage 3.
                 Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint.
-                See `deepspeed tutorial <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_
+                See `deepspeed tutorial
+                <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_
 
             cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled
 

From deb2ea265e53ba47e800bddf35d6db42e2a55525 Mon Sep 17 00:00:00 2001
From: Sean Naren <sean.narenthiran@gmail.com>
Date: Tue, 30 Mar 2021 14:05:26 +0100
Subject: [PATCH 46/60] Apply suggestions from code review

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 78ab9af3dc139..4dc3a7a4dc26c 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -38,6 +38,7 @@
 
 
 def remove_module_hooks(model: torch.nn.Module) -> None:
+    # TODO: awaiting this feature to move upstream to DeepSpeed
     for module in model.modules():
         module._backward_hooks = OrderedDict()
         module._is_full_backward_hook = None
@@ -320,7 +321,7 @@ def _initialize_deepspeed_inference(self, model):
             )
             optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
         inference_config = {
-            # todo: this is required for DeepSpeed throughput timers
+            # todo: this is required for DeepSpeed throughput timers, so throughput timers will be incorrect
             'train_micro_batch_size_per_gpu': 1,
         }
         if 'fp16' in self.config:

From 122e91109f928f1868ef700b668b9b6e8bf093f7 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Tue, 30 Mar 2021 14:07:27 +0100
Subject: [PATCH 47/60] Swap to using world size defined by plugin

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 4dc3a7a4dc26c..9d3b9650c5120 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -460,7 +460,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None:
             filepath: write-target file's path
             weights_only: saving model weights only
         """
-        if torch.distributed.get_world_size() > 1 and self.zero_stage_3:
+        if self.world_size > 1 and self.zero_stage_3:
             # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
             # dump states as a checkpoint dictionary object
             save_dir = self._filepath_to_dir(filepath)
@@ -474,7 +474,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None:
     def restore_model_state_from_ckpt_path(self,
                                            ckpt_path: str,
                                            map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
-        if torch.distributed.get_world_size() > 1:
+        if self.world_size > 1:
             from pytorch_lightning.trainer.states import TrainerState
             stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING
             save_dir = self._filepath_to_dir(ckpt_path)

From dfb403b3d481472021b2f4c7214fe7ff08402a52 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 14:10:03 +0100
Subject: [PATCH 48/60] update

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 78ab9af3dc139..7df3213067491 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -37,6 +37,7 @@
     import deepspeed
 
 
+# todo (tchaton) use deepspeed version when merged
 def remove_module_hooks(model: torch.nn.Module) -> None:
     for module in model.modules():
         module._backward_hooks = OrderedDict()
@@ -320,7 +321,7 @@ def _initialize_deepspeed_inference(self, model):
             )
             optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
         inference_config = {
-            # todo: this is required for DeepSpeed throughput timers
+            # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect
             'train_micro_batch_size_per_gpu': 1,
         }
         if 'fp16' in self.config:
@@ -369,8 +370,15 @@ def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Calla
         self.model.step(**kwargs)
 
     def _handle_gradient_accumulation_steps(self):
+        """
+        This function overrides the trainer.accumulation_scheduler to generate
+        ``accumulate_grad_batches=1``.
+        Therefore, ``optimizer_step`` will be called on every batch seen
+        so DeepSpeed Engine handles the gradient accumulation logic internally.
+        """
         if self.config.get("gradient_accumulation_steps") > 1:
             self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches
+            # todo (tchaton) Add support for accumulate_grad_batches being a dictionary.
             self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1})
         else:
             self._original_accumulate_grad_batches = None

From 066e0f05e74336ea7b0b641bafca198d576e6183 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 14:11:49 +0100
Subject: [PATCH 49/60] update todo

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index abba793842c85..f73a94474bdbd 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -37,9 +37,8 @@
     import deepspeed
 
 
-# todo (tchaton) use deepspeed version when merged
 def remove_module_hooks(model: torch.nn.Module) -> None:
-    # TODO: awaiting this feature to move upstream to DeepSpeed
+    # todo (tchaton) awaiting this feature to move upstream to DeepSpeed
     for module in model.modules():
         module._backward_hooks = OrderedDict()
         module._is_full_backward_hook = None

From d41284e6178d1a4f5793cdea5d2fc19b31aa98c9 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Tue, 30 Mar 2021 14:17:14 +0100
Subject: [PATCH 50/60] Remove deepspeed from extra, keep it in the base cuda
 docker install

---
 requirements/extra.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements/extra.txt b/requirements/extra.txt
index cee1fd0eb07e1..715916c4e36ac 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -9,4 +9,3 @@ onnxruntime>=1.3.0
 hydra-core>=1.0
 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs
 https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
-deepspeed>=0.3.13

From 0c9836cf75d8fec34dca4d18ba520e7604b93bbe Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Tue, 30 Mar 2021 14:17:43 +0100
Subject: [PATCH 51/60] Push

---
 .github/workflows/events-nightly.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index 5ad4396a006f7..91d509f193339 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -3,6 +3,7 @@ name: Nightly events
 # https://jasonet.co/posts/scheduled-actions/
 # https://github.community/t/distinct-job-for-each-schedule/17811/2
 on:
+  push: {} # fixme
   schedule:
     - cron: "0 0 * * *" # At the end of every day
 

From d1c511ee049c6f26e40c90e631968ff036d13179 Mon Sep 17 00:00:00 2001
From: SeanNaren <sean@grid.ai>
Date: Tue, 30 Mar 2021 14:18:01 +0100
Subject: [PATCH 52/60] pull

---
 .github/workflows/events-nightly.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index 91d509f193339..5ad4396a006f7 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -3,7 +3,6 @@ name: Nightly events
 # https://jasonet.co/posts/scheduled-actions/
 # https://github.community/t/distinct-job-for-each-schedule/17811/2
 on:
-  push: {} # fixme
   schedule:
     - cron: "0 0 * * *" # At the end of every day
 

From 67d31fa1122f8fa5d781215147556a3b92d0f8b1 Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 14:33:17 +0100
Subject: [PATCH 53/60] update

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index f73a94474bdbd..ab1bd3d8b2405 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -17,6 +17,7 @@
 import os
 from collections import OrderedDict
 from pathlib import Path
+from types import SimpleNamespace
 from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
 
 import torch
@@ -277,6 +278,7 @@ def _initialize_deepspeed_train(self, model):
             optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer()
         model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
         model, optimizer, _, lr_scheduler = deepspeed.initialize(
+            args=SimpleNamespace(local_rank=self.local_rank),
             model=model,
             model_parameters=model_parameters,
             optimizer=optimizer,

From 1740eed89b86aad2fb80124085299dc77c4c440f Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 14:46:52 +0100
Subject: [PATCH 54/60] update

---
 pytorch_lightning/plugins/training_type/deepspeed.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index ab1bd3d8b2405..0f5bb16bab7fd 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -336,6 +336,7 @@ def _initialize_deepspeed_inference(self, model):
         # Remove all module hooks before initializing new model
         remove_module_hooks(model)
         model, _, _, _ = deepspeed.initialize(
+            args=SimpleNamespace(local_rank=self.local_rank),
             model=model,
             optimizer=optimizer,
             lr_scheduler=lightning_scheduler,

From 300f3aa4cb61588beeed714363643efe0d41ad5d Mon Sep 17 00:00:00 2001
From: tchaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 15:02:01 +0100
Subject: [PATCH 55/60] update

---
 azure-pipelines.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 85664bac74b67..9d3cfdd2ac1ce 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -63,6 +63,7 @@ jobs:
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
         pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir
+        pip install --no-cache-dir git+https://github.com/microsoft/DeepSpeed.git
         pip list
       displayName: 'Install dependencies'
 

From 40b1cc6c2505456d39d5e96bcb7f0108a962c753 Mon Sep 17 00:00:00 2001
From: thomas chaton <thomas@grid.ai>
Date: Tue, 30 Mar 2021 15:29:43 +0000
Subject: [PATCH 56/60] update

---
 azure-pipelines.yml    | 1 -
 requirements/extra.txt | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 9d3cfdd2ac1ce..85664bac74b67 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -63,7 +63,6 @@ jobs:
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
         pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir
-        pip install --no-cache-dir git+https://github.com/microsoft/DeepSpeed.git
         pip list
       displayName: 'Install dependencies'
 
diff --git a/requirements/extra.txt b/requirements/extra.txt
index 715916c4e36ac..cee1fd0eb07e1 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -9,3 +9,4 @@ onnxruntime>=1.3.0
 hydra-core>=1.0
 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs
 https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip
+deepspeed>=0.3.13

From 603caf13bed6c5ec2e8a1e748ab21c9660f21242 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 30 Mar 2021 17:54:26 +0200
Subject: [PATCH 57/60] Minor changes

---
 .../plugins/training_type/deepspeed.py        |  2 +-
 pytorch_lightning/utilities/imports.py        |  3 ++-
 tests/special_tests.sh                        | 23 ++++++++-----------
 3 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 0f5bb16bab7fd..10cce8d0e0182 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -456,7 +456,7 @@ def _create_default_config(
             }
         return cfg
 
-    def _filepath_to_dir(self, filepath: str):
+    def _filepath_to_dir(self, filepath: str) -> str:
         return os.path.dirname(filepath)
 
     @property
diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py
index 5c4de60263aef..001b9a67c5703 100644
--- a/pytorch_lightning/utilities/imports.py
+++ b/pytorch_lightning/utilities/imports.py
@@ -69,7 +69,7 @@ def _compare_version(package: str, op, version) -> bool:
 _TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0")
 _TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0")
 _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0")
-_KINETO_AVAILABLE = torch.profiler.kineto_available() if _TORCH_GREATER_EQUAL_1_8 else False
+
 _APEX_AVAILABLE = _module_available("apex.amp")
 _BOLTS_AVAILABLE = _module_available('pl_bolts')
 _DEEPSPEED_AVAILABLE = not _IS_WINDOWS and _module_available('deepspeed')
@@ -79,6 +79,7 @@ def _compare_version(package: str, op, version) -> bool:
 _HOROVOD_AVAILABLE = _module_available("horovod.torch")
 _HYDRA_AVAILABLE = _module_available("hydra")
 _HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental")
+_KINETO_AVAILABLE = torch.profiler.kineto_available() if _TORCH_GREATER_EQUAL_1_8 else False
 _NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast")
 _OMEGACONF_AVAILABLE = _module_available("omegaconf")
 _RPC_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.rpc')
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index b82c90c8decfb..cf81700291b8d 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -32,9 +32,6 @@ linenos_arr=($linenos)
 blocklist='test_pytorch_profiler_nested_emit_nvtx'
 report=''
 
-# replace debuggin token by anything to filter failing test. Reset to True at when committing.
-DEBUGGING_TOKEN=""
-
 for i in "${!files_arr[@]}"; do
   file=${files_arr[$i]}
   lineno=${linenos_arr[$i]}
@@ -55,24 +52,24 @@ for i in "${!files_arr[@]}"; do
         break
       fi
 
-      if [[ $line == *$DEBUGGING_TOKEN* ]]; then
-        # run the test
-        report+="Ran\t$file:$lineno::$test_name\n"
-        python ${defaults} "${file}::${test_name}"
+      # SPECIAL_PATTERN allows filtering the tests to run when debugging.
+      # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only those
+      # tests with `foo_bar` in their name
+      if [[ $line != *$SPECIAL_PATTERN* ]]; then
+        report+="Skipped\t$file:$lineno::$test_name\n"
         break
       fi
+
+      # run the test
+      report+="Ran\t$file:$lineno::$test_name\n"
+      python ${defaults} "${file}::${test_name}"
+      break
     fi
   done < <(echo "$test_code")
 done
 
 nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx
 
-if [[ -n ${DEBUGGING_TOKEN} ]];
-then
-  echo "DEBUGGING_TOKEN: $DEBUGGING_TOKEN should be empty"
-  exit 1
-fi
-
 # echo test report
 printf '=%.s' {1..80}
 printf "\n$report"

From 62f67e813d8e76685c819f84e9f97319c3772ef6 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <Borda@users.noreply.github.com>
Date: Tue, 30 Mar 2021 18:01:50 +0200
Subject: [PATCH 58/60] duplicate

---
 dockers/base-cuda/Dockerfile | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index 476ef75319b4b..a3624f536a0f2 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -113,9 +113,6 @@ RUN \
     pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \
     rm -rf apex
 
-RUN \
-    pip install deepspeed>=0.3.13
-
 RUN \
     # Show what we have
     pip --version && \

From 5786c4b3eaeb3f9a007b5276eb3eb6fb612406dc Mon Sep 17 00:00:00 2001
From: Jirka Borovec <jirka.borovec@seznam.cz>
Date: Tue, 30 Mar 2021 18:05:52 +0200
Subject: [PATCH 59/60] format

---
 pytorch_lightning/plugins/precision/double.py |  6 +--
 .../plugins/training_type/deepspeed.py        | 19 ++++++---
 pytorch_lightning/utilities/__init__.py       |  1 +
 tests/models/test_hooks.py                    | 42 +++++--------------
 tests/plugins/test_double_plugin.py           |  6 +--
 5 files changed, 27 insertions(+), 47 deletions(-)

diff --git a/pytorch_lightning/plugins/precision/double.py b/pytorch_lightning/plugins/precision/double.py
index 4720f0f874fd0..6e37c79f2b163 100644
--- a/pytorch_lightning/plugins/precision/double.py
+++ b/pytorch_lightning/plugins/precision/double.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from functools import wraps
-from typing import Any, Sequence, Tuple, TYPE_CHECKING, List
+from typing import Any, List, Sequence, Tuple, TYPE_CHECKING
 
 import torch
 
@@ -44,9 +44,7 @@ def _to_double_precision(data: torch.Tensor) -> torch.Tensor:
 
     @staticmethod
     def _move_float_tensors_to_double(collection: Any) -> Any:
-        return apply_to_collection(
-            collection, torch.Tensor, function=_DoublePrecisionPatch._to_double_precision
-        )
+        return apply_to_collection(collection, torch.Tensor, function=_DoublePrecisionPatch._to_double_precision)
 
     @classmethod
     def patch(cls, model: 'Module', method_name: str) -> '_DoublePrecisionPatch':
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 10cce8d0e0182..7011b81a8d131 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -436,9 +436,14 @@ def _format_precision_config(self):
             raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.")
 
     def _create_default_config(
-        self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool,
-        cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool,
-        **zero_kwargs
+        self,
+        zero_optimization: bool,
+        zero_allow_untested_optimizer: bool,
+        partition_activations: bool,
+        cpu_checkpointing: bool,
+        contiguous_memory_optimization: bool,
+        synchronize_checkpoint_boundary: bool,
+        **zero_kwargs,
     ) -> Dict:
         cfg = {
             'activation_checkpointing': {
@@ -481,9 +486,11 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None:
         else:
             super().save_checkpoint(checkpoint, filepath)
 
-    def restore_model_state_from_ckpt_path(self,
-                                           ckpt_path: str,
-                                           map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
+    def restore_model_state_from_ckpt_path(
+        self,
+        ckpt_path: str,
+        map_location=lambda storage, loc: storage,
+    ) -> Tuple[Dict, bool]:
         if self.world_size > 1:
             from pytorch_lightning.trainer.states import TrainerState
             stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING
diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py
index 03981b0042eac..28cb05bc06f2d 100644
--- a/pytorch_lightning/utilities/__init__.py
+++ b/pytorch_lightning/utilities/__init__.py
@@ -14,6 +14,7 @@
 """General utilities"""
 
 import numpy
+
 from pytorch_lightning.utilities.apply_func import move_data_to_device  # noqa: F401
 from pytorch_lightning.utilities.distributed import (  # noqa: F401
     AllGatherGrad,
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 1d55d4a5a63b7..57af82ccc3e08 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -19,7 +19,7 @@
 
 from pytorch_lightning import Callback, Trainer
 from pytorch_lightning.trainer.states import TrainerState
-from tests.helpers import BoringModel, RandomDataset, BoringDataModule
+from tests.helpers import BoringDataModule, BoringModel, RandomDataset
 from tests.helpers.runif import RunIf
 
 
@@ -515,6 +515,7 @@ def test_trainer_datamodule_hook_system(tmpdir):
     """Test the LightningDataModule hook system."""
 
     class HookedDataModule(BoringDataModule):
+
         def __init__(self):
             super().__init__()
             self.called = []
@@ -574,23 +575,10 @@ def on_after_batch_transfer(self, *args, **kwargs):
     trainer.fit(model, datamodule=dm)
 
     expected = [
-        'prepare_data',
-        'setup_fit',
-        'val_dataloader',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'train_dataloader',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'val_dataloader',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
+        'prepare_data', 'setup_fit', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
+        'on_after_batch_transfer', 'train_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
+        'on_after_batch_transfer', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer',
+        'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer',
         'teardown_fit'
     ]
     assert dm.called == expected
@@ -599,13 +587,8 @@ def on_after_batch_transfer(self, *args, **kwargs):
     trainer.validate(model, datamodule=dm, verbose=False)
 
     expected = [
-        'prepare_data',
-        'setup_validate',
-        'val_dataloader',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'teardown_validate'
+        'prepare_data', 'setup_validate', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
+        'on_after_batch_transfer', 'teardown_validate'
     ]
     assert dm.called == expected
 
@@ -613,12 +596,7 @@ def on_after_batch_transfer(self, *args, **kwargs):
     trainer.test(model, datamodule=dm, verbose=False)
 
     expected = [
-        'prepare_data',
-        'setup_test',
-        'test_dataloader',
-        'on_before_batch_transfer',
-        'transfer_batch_to_device',
-        'on_after_batch_transfer',
-        'teardown_test'
+        'prepare_data', 'setup_test', 'test_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device',
+        'on_after_batch_transfer', 'teardown_test'
     ]
     assert dm.called == expected
diff --git a/tests/plugins/test_double_plugin.py b/tests/plugins/test_double_plugin.py
index f089b1c23149e..175ca5ecaba6b 100644
--- a/tests/plugins/test_double_plugin.py
+++ b/tests/plugins/test_double_plugin.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pytest
-
 import torch
 from torch.utils.data import DataLoader, Dataset
 
@@ -107,10 +106,7 @@ def predict_dataloader(self):
         return DataLoader(RandomDataset(32, 64))
 
 
-@pytest.mark.parametrize(
-    'boring_model',
-    (DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward)
-)
+@pytest.mark.parametrize('boring_model', (DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward))
 def test_double_precision(tmpdir, boring_model):
     model = boring_model()
     original_training_step = model.training_step

From 83e1343dc9ebdceb33c626698ac541cddf1cf5e7 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <jirka.borovec@seznam.cz>
Date: Tue, 30 Mar 2021 18:19:04 +0200
Subject: [PATCH 60/60] format2

---
 pytorch_lightning/plugins/training_type/deepspeed.py      | 2 +-
 .../plugins/training_type/training_type_plugin.py         | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 7011b81a8d131..3dc52b60055d8 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -489,7 +489,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None:
     def restore_model_state_from_ckpt_path(
         self,
         ckpt_path: str,
-        map_location=lambda storage, loc: storage,
+        map_location: Callable = lambda storage, loc: storage,
     ) -> Tuple[Dict, bool]:
         if self.world_size > 1:
             from pytorch_lightning.trainer.states import TrainerState
diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py
index d155ce018c77d..01c23504b7773 100644
--- a/pytorch_lightning/plugins/training_type/training_type_plugin.py
+++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -198,9 +198,11 @@ def setup_optimizers_in_pre_dispatch(self) -> bool:
         """
         return False
 
-    def restore_model_state_from_ckpt_path(self,
-                                           ckpt_path: str,
-                                           map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]:
+    def restore_model_state_from_ckpt_path(
+        self,
+        ckpt_path: str,
+        map_location: Callable = lambda storage, loc: storage,
+    ) -> Tuple[Dict, bool]:
         """
         This function is used to load and restore the model state.