From 2d3f617aa5f5d1e33e6da24242c621205b29e149 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 10 Mar 2021 17:45:03 +0000 Subject: [PATCH 01/60] Add context to call hook to handle all modules defined within the hook --- pytorch_lightning/plugins/training_type/deepspeed.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index b54155d60eae5..eb9d7aba9773e 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -233,6 +233,9 @@ def _init_scheduler_optimizer(self): def _initialize_deepspeed_train(self, model): if self.on_gpu: torch.cuda.set_device(self.root_device) + with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): + self.lightning_module.trainer.call_hook("on_model_parallel_setup") + optimizer, lightning_scheduler, optimizer_frequencies = None, None, None if "optimizer" not in self.config: rank_zero_info( From 99495e8ac113eaeb36930a77da1f72f1d9e3e1bd Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 10 Mar 2021 23:57:35 +0000 Subject: [PATCH 02/60] Expose some additional parameters --- pytorch_lightning/core/hooks.py | 7 +++ .../plugins/training_type/deepspeed.py | 52 ++++++++++++++++--- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 9826f9d44ac2c..86480d8c22598 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -334,6 +334,13 @@ def on_post_move_to_device(self): """ + def on_model_parallel_setup(self) -> None: + """ + + Returns: + + """ + class DataHooks: """Hooks to be used for data related stuff.""" diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index eb9d7aba9773e..cf35ac724d059 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ 
b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -83,7 +83,12 @@ def __init__( initial_scale_power: int = 32, loss_scale_window: int = 1000, hysteresis: int = 2, - min_loss_scale: int = 1 + min_loss_scale: int = 1, + activation_checkpointing: bool = False, + partition_activations: bool = False, + cpu_checkpointing: bool = False, + contiguous_memory_optimization: bool = False, + synchronize_checkpoint_boundary: bool = False, ) -> None: """ @@ -159,6 +164,11 @@ def __init__( self.config = self._create_default_config( zero_optimization, zero_allow_untested_optimizer, + activation_checkpointing=activation_checkpointing, + partition_activations=partition_activations, + cpu_checkpointing=cpu_checkpointing, + contiguous_memory_optimization=contiguous_memory_optimization, + synchronize_checkpoint_boundary=synchronize_checkpoint_boundary, stage=stage, cpu_offload=cpu_offload, contiguous_gradients=contiguous_gradients, @@ -230,11 +240,17 @@ def _init_scheduler_optimizer(self): optimizer = optimizers[0] return optimizer, scheduler, optimizer_frequencies + @property + def zero_stage_3(self) -> bool: + return self.config.get('zero_optimization') and self.config.get('zero_optimization').get('stage') == 3 + def _initialize_deepspeed_train(self, model): if self.on_gpu: torch.cuda.set_device(self.root_device) - with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): - self.lightning_module.trainer.call_hook("on_model_parallel_setup") + + if self.zero_stage_3: + with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): + self.lightning_module.trainer.call_hook("on_model_parallel_setup") optimizer, lightning_scheduler, optimizer_frequencies = None, None, None if "optimizer" not in self.config: @@ -252,11 +268,22 @@ def _initialize_deepspeed_train(self, model): lr_scheduler=lightning_scheduler, config_params=self.config, ) + self._set_deepspeed_activation_checkpointing() # set optimizer for save/load, but deepspeed manages the specific optimizer logic 
self.lightning_module.trainer.optimizers = [optimizer] self.model = model + def _set_deepspeed_activation_checkpointing(self): + checkpoint_config = self.config.get('activation_checkpointing', {}) + deepspeed.checkpointing.configure( + mpu_=None, + partition_activations=checkpoint_config.get('partition_activations'), + contiguous_checkpointing=checkpoint_config.get('contiguous_checkpointing'), + checkpoint_in_cpu=checkpoint_config.get('checkpoint_in_cpu'), + profile=checkpoint_config.get('profile'), + ) + def _initialize_deepspeed_inference(self, model): # move the model to the correct device self.model_to_device() @@ -346,8 +373,21 @@ def _format_precision_config(self): raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.") def _create_default_config( - self, zero_optimization: bool, zero_allow_untested_optimizer: bool, **zero_kwargs + self, zero_optimization: bool, zero_allow_untested_optimizer: bool, activation_checkpointing: bool, + partition_activations: bool, cpu_checkpointing: bool, contiguous_memory_optimization: bool, + synchronize_checkpoint_boundary: bool, **zero_kwargs ) -> Dict: + cfg = {} if zero_optimization: - return {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs} - return {} + cfg = {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs} + if activation_checkpointing: + cfg = { + 'activation_checkpointing': { + "partition_activations": partition_activations, + "cpu_checkpointing": cpu_checkpointing, + "contiguous_memory_optimization": contiguous_memory_optimization, + "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary + }, + **cfg + } + return cfg From c3aac675ddf8fbd01a0cbf9f0cd6e2499cd50480 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 11 Mar 2021 13:57:24 +0000 Subject: [PATCH 03/60] Added docs, exposed parameters --- .../plugins/training_type/deepspeed.py | 20 +++++++++++++++++++ 1 file 
changed, 20 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index cf35ac724d059..addbc1fb744c9 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -67,6 +67,8 @@ def __init__( zero_optimization: bool = True, stage: int = 2, cpu_offload: bool = False, + cpu_offload_params: bool = False, + cpu_offload_use_pin_memory: bool = False, contiguous_gradients: bool = True, overlap_comm: bool = True, allgather_partitions: bool = True, @@ -111,6 +113,10 @@ def __init__( cpu_offload: Enable offloading optimizer memory and computation to CPU + cpu_offload_params: When using ZeRO stage 3, offload parameters to CPU + + cpu_offload_use_pin_memory: When using ZeRO stage 3, pin memory on CPU + contiguous_gradients: Copies gradients to a continuous buffer as they are produced. Avoids memory fragmentation during backwards. Useful when training large models. (default: True) @@ -149,6 +155,18 @@ def __init__( min_loss_scale: The minimum FP16 dynamic loss scaling value (Default: 1000) + activation_checkpointing: Enable activation checkpointing. This allows DeepSpeed to setup global variables + however still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint. + See https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional + + partition_activations: Enables partition activation when used with ZeRO stage 3 + + cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled + + contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory. + Not supported by all models + + synchronize_checkpoint_boundary: Insert ``torch.cuda.synchronize()`` at each checkpoint boundary. 
""" if not _DEEPSPEED_AVAILABLE: raise MisconfigurationException( @@ -171,6 +189,8 @@ def __init__( synchronize_checkpoint_boundary=synchronize_checkpoint_boundary, stage=stage, cpu_offload=cpu_offload, + cpu_offload_params=cpu_offload_params, + cpu_offload_use_pin_memory=cpu_offload_use_pin_memory, contiguous_gradients=contiguous_gradients, overlap_comm=overlap_comm, allgather_partitions=allgather_partitions, From 340f817406d0f51e052f7ad7884e639c5d8f0a61 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 11 Mar 2021 17:46:39 +0000 Subject: [PATCH 04/60] Make sure we only configure if necessary --- .../plugins/training_type/deepspeed.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index addbc1fb744c9..0e3bc4f16cde8 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -295,14 +295,15 @@ def _initialize_deepspeed_train(self, model): self.model = model def _set_deepspeed_activation_checkpointing(self): - checkpoint_config = self.config.get('activation_checkpointing', {}) - deepspeed.checkpointing.configure( - mpu_=None, - partition_activations=checkpoint_config.get('partition_activations'), - contiguous_checkpointing=checkpoint_config.get('contiguous_checkpointing'), - checkpoint_in_cpu=checkpoint_config.get('checkpoint_in_cpu'), - profile=checkpoint_config.get('profile'), - ) + if self.config.get('activation_checkpointing'): + checkpoint_config = self.config['activation_checkpointing'] + deepspeed.checkpointing.configure( + mpu_=None, + partition_activations=checkpoint_config.get('partition_activations'), + contiguous_checkpointing=checkpoint_config.get('contiguous_checkpointing'), + checkpoint_in_cpu=checkpoint_config.get('checkpoint_in_cpu'), + profile=checkpoint_config.get('profile'), + ) def _initialize_deepspeed_inference(self, model): # move 
the model to the correct device From f192afc1f6b824ec0339c4c2b68c907bc01d6123 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 12 Mar 2021 00:29:37 +0000 Subject: [PATCH 05/60] Setup activation checkpointing regardless, saves the user having to do it manually --- .../plugins/training_type/deepspeed.py | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 0e3bc4f16cde8..3eb14ff2959e6 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -86,7 +86,6 @@ def __init__( loss_scale_window: int = 1000, hysteresis: int = 2, min_loss_scale: int = 1, - activation_checkpointing: bool = False, partition_activations: bool = False, cpu_checkpointing: bool = False, contiguous_memory_optimization: bool = False, @@ -155,12 +154,10 @@ def __init__( min_loss_scale: The minimum FP16 dynamic loss scaling value (Default: 1000) - activation_checkpointing: Enable activation checkpointing. This allows DeepSpeed to setup global variables - however still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint. + partition_activations: Enables partition activation when used with ZeRO stage 3. + Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint. See https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional - partition_activations: Enables partition activation when used with ZeRO stage 3 - cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory. 
@@ -182,7 +179,6 @@ def __init__( self.config = self._create_default_config( zero_optimization, zero_allow_untested_optimizer, - activation_checkpointing=activation_checkpointing, partition_activations=partition_activations, cpu_checkpointing=cpu_checkpointing, contiguous_memory_optimization=contiguous_memory_optimization, @@ -394,21 +390,18 @@ def _format_precision_config(self): raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.") def _create_default_config( - self, zero_optimization: bool, zero_allow_untested_optimizer: bool, activation_checkpointing: bool, - partition_activations: bool, cpu_checkpointing: bool, contiguous_memory_optimization: bool, - synchronize_checkpoint_boundary: bool, **zero_kwargs + self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool, + cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool, + **zero_kwargs ) -> Dict: - cfg = {} + cfg = { + 'activation_checkpointing': { + "partition_activations": partition_activations, + "cpu_checkpointing": cpu_checkpointing, + "contiguous_memory_optimization": contiguous_memory_optimization, + "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary + } + } if zero_optimization: cfg = {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs} - if activation_checkpointing: - cfg = { - 'activation_checkpointing': { - "partition_activations": partition_activations, - "cpu_checkpointing": cpu_checkpointing, - "contiguous_memory_optimization": contiguous_memory_optimization, - "synchronize_checkpoint_boundary": synchronize_checkpoint_boundary - }, - **cfg - } return cfg From a2784a479996ba444b603ffe88752fae0ddb917a Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Fri, 12 Mar 2021 23:43:10 +0000 Subject: [PATCH 06/60] Add some tests that fail currently --- .../plugins/training_type/deepspeed.py | 43 +++++++------ 
tests/plugins/test_deepspeed_plugin.py | 63 ++++++++++++++++++- 2 files changed, 87 insertions(+), 19 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 3eb14ff2959e6..612a92edbc97d 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -20,7 +20,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -from torch.nn.parallel import DistributedDataParallel from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase @@ -239,6 +238,11 @@ def init_deepspeed(self): precision = self.lightning_module.trainer.accelerator.precision model = LightningDeepSpeedModule(pl_module=self.model, precision=precision) + if self.on_gpu: + torch.cuda.set_device(self.root_device) + + self._call_model_parallel_setup() + if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) else: @@ -261,13 +265,6 @@ def zero_stage_3(self) -> bool: return self.config.get('zero_optimization') and self.config.get('zero_optimization').get('stage') == 3 def _initialize_deepspeed_train(self, model): - if self.on_gpu: - torch.cuda.set_device(self.root_device) - - if self.zero_stage_3: - with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): - self.lightning_module.trainer.call_hook("on_model_parallel_setup") - optimizer, lightning_scheduler, optimizer_frequencies = None, None, None if "optimizer" not in self.config: rank_zero_info( @@ -290,6 +287,11 @@ def _initialize_deepspeed_train(self, model): self.lightning_module.trainer.optimizers = [optimizer] self.model = model + def _call_model_parallel_setup(self): + if self.zero_stage_3: + with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): + self.lightning_module.trainer.call_hook("on_model_parallel_setup") + def 
_set_deepspeed_activation_checkpointing(self): if self.config.get('activation_checkpointing'): checkpoint_config = self.config['activation_checkpointing'] @@ -302,15 +304,16 @@ def _set_deepspeed_activation_checkpointing(self): ) def _initialize_deepspeed_inference(self, model): - # move the model to the correct device - self.model_to_device() - - self.pre_configure_ddp() - self.model = DistributedDataParallel( - model, - device_ids=self.determine_ddp_device_ids(), - **self._ddp_kwargs, + inference_config = { + 'train_micro_batch_size_per_gpu': 1, + 'fp16': self.config['fp16'], + } + model, _, _, _ = deepspeed.initialize( + args=SimpleNamespace(local_rank=self.local_rank), + model=model, + config_params=inference_config, ) + self.model = model def configure_scheduler(self, lr_scheduler): scheduler = _get_default_scheduler_config() @@ -357,7 +360,7 @@ def _format_batch_size_and_grad_accum_config(self): if "train_micro_batch_size_per_gpu" not in self.config: # train_micro_batch_size_per_gpu is used for throughput logging purposes # by default we use the batch size of the loader which may be incorrect if a batch sampler is passed - batch_size = self.lightning_module.train_dataloader().batch_size + batch_size = self.lightning_module.train_dataloader().batch_sampler.batch_size self.config["train_micro_batch_size_per_gpu"] = batch_size self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches if "gradient_clipping" not in self.config: @@ -403,5 +406,9 @@ def _create_default_config( } } if zero_optimization: - cfg = {"zero_allow_untested_optimizer": zero_allow_untested_optimizer, "zero_optimization": zero_kwargs} + cfg = { + "zero_allow_untested_optimizer": zero_allow_untested_optimizer, + "zero_optimization": zero_kwargs, + **cfg + } return cfg diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index cf5c23a824732..81b38d85c9c10 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ 
b/tests/plugins/test_deepspeed_plugin.py @@ -301,6 +301,22 @@ def on_train_start(self) -> None: trainer.fit(model) +@RunIf(deepspeed=True) +def test_deepspeed_custom_activation_checkpointing_params(tmpdir): + """Ensure if we modify the activation checkpointing parameters, the deepspeed config contains these changes.""" + ds = DeepSpeedPlugin( + partition_activations=True, + cpu_checkpointing=True, + contiguous_memory_optimization=True, + synchronize_checkpoint_boundary=True + ) + checkpoint_config = ds.config['activation_checkpointing'] + assert checkpoint_config['partition_activations'] + assert checkpoint_config['cpu_checkpointing'] + assert checkpoint_config['contiguous_memory_optimization'] + assert checkpoint_config['synchronize_checkpoint_boundary'] + + @RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config.""" @@ -324,7 +340,7 @@ def on_train_start(self) -> None: trainer.fit(model) -@RunIf(min_gpus=2, special=True, deepspeed=True) +@RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu(tmpdir, deepspeed_config): """ Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation. @@ -343,6 +359,51 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer) +class ModelParallelBoringModel(BoringModel): + + def __init__(self): + super().__init__() + self.linear = None + + def on_model_parallel_setup(self) -> None: + self.linear = torch.nn.Linear(32, 2) + + +@RunIf(min_gpus=2, deepspeed=True) +def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): + """ + Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation. 
+ """ + model = ModelParallelBoringModel() + trainer = Trainer( + plugins=[DeepSpeedPlugin(stage=3)], + default_root_dir=tmpdir, + gpus=2, + fast_dev_run=True, + precision=16, + ) + trainer.fit(model) + trainer.test(model) + + _assert_save_model_is_equal(model, tmpdir, trainer) + + +@RunIf(min_gpus=2, deepspeed=True) +def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): + """ + Test to ensure we can use DeepSpeed with just test. + """ + model = ModelParallelBoringModel() + trainer = Trainer( + plugins=[DeepSpeedPlugin(stage=3)], + default_root_dir=tmpdir, + gpus=2, + fast_dev_run=True, + precision=16, + ) + trainer.test(model) + + def _assert_save_model_is_equal(model, tmpdir, trainer): checkpoint_path = os.path.join(tmpdir, 'model.pt') trainer.save_checkpoint(checkpoint_path) From b0dab3d3f95b6482a013477ca21f745c327a0da8 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 15 Mar 2021 19:42:24 +0000 Subject: [PATCH 07/60] update --- .../plugins/training_type/deepspeed.py | 29 ++++++++++++++++++- .../training_type/training_type_plugin.py | 12 ++++++++ .../connectors/checkpoint_connector.py | 17 +++++------ pytorch_lightning/trainer/trainer.py | 3 +- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 612a92edbc97d..994d4218ff2fd 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -222,6 +222,8 @@ def pre_dispatch(self): self.init_deepspeed() + self.lightning_module.trainer.save_checkpoint = self.save_checkpoint + # set warning rank rank_zero_only.rank = self.global_rank @@ -367,7 +369,6 @@ def _format_batch_size_and_grad_accum_config(self): self.config["gradient_clipping"] = self.lightning_module.trainer.gradient_clip_val def _format_precision_config(self): - amp_type = self.lightning_module.trainer.accelerator_connector.amp_type amp_level = 
self.lightning_module.trainer.accelerator_connector.amp_level precision = self.lightning_module.trainer.accelerator_connector.precision @@ -412,3 +413,29 @@ def _create_default_config( **cfg } return cfg + + def _filepath_to_dir(self, filepath: str): + return filepath.split('.')[0] + + def save_checkpoint(self, filepath: str, weights_only: bool = False): + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + filepath: write-target file's path + weights_only: saving model weights only + """ + # dump states as a checkpoint dictionary object + _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) + save_dir = self._filepath_to_dir(filepath) + _exclude_keys = []# ['optimizer_states', 'lr_schedulers'] + _checkpoint = {k:v for k, v in _checkpoint.items() if k not in _exclude_keys} + self.model.save_checkpoint(save_dir, client_state=_checkpoint) + + def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage): + if torch.distributed.is_available(): + from pytorch_lightning.trainer.states import TrainerState + _load_optimization = self.lightning_module.trainer.state == TrainerState.FITTING + save_dir = self._filepath_to_dir(ckpt_path) + self.model.optimizer._partition_all_parameters() + _, client_state = self.model.load_checkpoint(save_dir, load_optimizer_states=_load_optimization, load_lr_scheduler_states=_load_optimization) + return client_state, False \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 7783f066dbc61..603cf64e8d7b5 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -13,6 +13,7 @@ # limitations under the License. 
from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union +from pytorch_lightning.utilities.cloud_io import load as pl_load import torch from torch.nn import Module @@ -169,3 +170,14 @@ def init_optimizers(self, trainer: "Trainer", model: LightningModule): def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs): optimizer.step(closure=lambda_closure, **kwargs) + + def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Dict: + ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) + # restore datamodule states + if self.lightning_module.trainer.datamodule is not None: + self.lightning_module.trainer.datamodule.on_load_checkpoint(ckpt) + + # hook: give user access to checkpoint if needed. + self.lightning_module.on_load_checkpoint(ckpt) + self.lightning_module.load_state_dict(ckpt['state_dict']) + return ckpt, True diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 60c76b70bba50..987d2dec65f13 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -90,20 +90,16 @@ def restore(self, checkpoint_path: str, on_gpu: bool) -> bool: rank_zero_warn("No checkpoint file exists at `resume_from_checkpoint`. 
Start from scratch") return False - # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path` - checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) + checkpoint, load_optimizer_states = self.trainer.training_type_plugin.restore_model_state_from_ckpt_path( + checkpoint_path, map_location=lambda storage, loc: storage) - # acquire the model model = self.trainer.lightning_module - # restore model and datamodule state - self.restore_model_state(model, checkpoint) - if on_gpu: model.cuda(self.trainer.root_gpu) # restore training state - self.restore_training_state(checkpoint) + self.restore_training_state(checkpoint, load_optimizer_states) rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}") return True @@ -123,7 +119,7 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None: # restore model state_dict model.load_state_dict(checkpoint['state_dict']) - def restore_training_state(self, checkpoint): + def restore_training_state(self, checkpoint, load_optimizer_states: bool): """ Restore trainer state. Model will get its change to update @@ -131,7 +127,7 @@ def restore_training_state(self, checkpoint): :return: """ # validation - if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint: + if load_optimizer_states and ('optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint): raise KeyError( 'Trying to restore training state but checkpoint contains only the model.' ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.' @@ -177,6 +173,9 @@ def restore_training_state(self, checkpoint): " consider using an end of epoch checkpoint." 
) + if not load_optimizer_states: + return + # restore the optimizers optimizer_states = checkpoint['optimizer_states'] for optimizer, opt_state in zip(self.trainer.optimizers, optimizer_states): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c3039d24aadc0..8ebab2e110aca 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -971,8 +971,7 @@ def __load_ckpt_weights( self.training_type_plugin.barrier() - ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt['state_dict']) + self.training_type_plugin.restore_model_state_from_ckpt_path(ckpt_path, map_location=lambda storage, loc: storage) return ckpt_path def predict( From 0c44f0585a260ae684a47c26b6ebaa8f5afcb200 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 15 Mar 2021 19:58:03 +0000 Subject: [PATCH 08/60] update --- .../plugins/training_type/deepspeed.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 994d4218ff2fd..b15975019db2d 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -425,17 +425,21 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False): weights_only: saving model weights only """ # dump states as a checkpoint dictionary object - _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) + client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) save_dir = self._filepath_to_dir(filepath) - _exclude_keys = []# ['optimizer_states', 'lr_schedulers'] - _checkpoint = {k:v for k, v in _checkpoint.items() if k not in _exclude_keys} - self.model.save_checkpoint(save_dir, client_state=_checkpoint) + _exclude_keys = ['state_dict', 'optimizer_states', 
'lr_schedulers'] + client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys} + self.model.save_checkpoint(save_dir, client_state=client_state) def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage): if torch.distributed.is_available(): - from pytorch_lightning.trainer.states import TrainerState - _load_optimization = self.lightning_module.trainer.state == TrainerState.FITTING save_dir = self._filepath_to_dir(ckpt_path) self.model.optimizer._partition_all_parameters() - _, client_state = self.model.load_checkpoint(save_dir, load_optimizer_states=_load_optimization, load_lr_scheduler_states=_load_optimization) + _, client_state = self.model.load_checkpoint(save_dir) + + if self.lightning_module.trainer.datamodule is not None: + self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state) + + # hook: give user access to checkpoint if needed. + self.lightning_module.on_load_checkpoint(client_state) return client_state, False \ No newline at end of file From 26655d7f032b2680b8a35c0b4a159c8224af51a3 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 15 Mar 2021 20:02:39 +0000 Subject: [PATCH 09/60] update --- pytorch_lightning/plugins/training_type/deepspeed.py | 11 ++++++++--- .../plugins/training_type/training_type_plugin.py | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index b15975019db2d..be6df07ba22b3 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -417,7 +417,7 @@ def _create_default_config( def _filepath_to_dir(self, filepath: str): return filepath.split('.')[0] - def save_checkpoint(self, filepath: str, weights_only: bool = False): + def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None: """Save model/training states as a checkpoint file 
through state-dump and file-write. Args: @@ -431,12 +431,17 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False): client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys} self.model.save_checkpoint(save_dir, client_state=client_state) - def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage): + def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: if torch.distributed.is_available(): + from pytorch_lightning.trainer.states import TrainerState + load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING save_dir = self._filepath_to_dir(ckpt_path) self.model.optimizer._partition_all_parameters() - _, client_state = self.model.load_checkpoint(save_dir) + _, client_state = self.model.load_checkpoint( + save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) + + # restore datamodule states if self.lightning_module.trainer.datamodule is not None: self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 603cf64e8d7b5..9e2de5ce61ab0 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union +from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union, Tuple from pytorch_lightning.utilities.cloud_io import load as pl_load import torch @@ -171,7 +171,7 @@ def init_optimizers(self, trainer: "Trainer", model: LightningModule): def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs): optimizer.step(closure=lambda_closure, **kwargs) - def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Dict: + def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) # restore datamodule states if self.lightning_module.trainer.datamodule is not None: From ac19f369172e179404814b782264b3e553bf90bb Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 16 Mar 2021 10:16:18 +0000 Subject: [PATCH 10/60] add tests --- tests/helpers/simple_models.py | 6 +-- tests/plugins/test_deepspeed_plugin.py | 55 +++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py index 1abeb1f00206a..a35ab2fc4a2fe 100644 --- a/tests/helpers/simple_models.py +++ b/tests/helpers/simple_models.py @@ -54,20 +54,20 @@ def training_step(self, batch, batch_idx): logits = self.forward(x) loss = F.cross_entropy(logits, y) self.log('train_loss', loss, prog_bar=True) - self.log('train_acc', self.train_acc(logits, y), prog_bar=True) + self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('val_acc', self.valid_acc(logits, y), prog_bar=True) + 
self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) def test_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('test_acc', self.test_acc(logits, y), prog_bar=True) + self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) class RegressionModel(LightningModule): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 81b38d85c9c10..7a8e226e3ca56 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,16 +1,20 @@ import json import os +from pytorch_lightning.core import datamodule import pytest import torch from torch import Tensor from torch.optim import Optimizer - +from torch import nn +from pytorch_lightning.metrics import Accuracy from pytorch_lightning import Trainer from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel from tests.helpers.runif import RunIf @@ -369,6 +373,24 @@ def on_model_parallel_setup(self) -> None: self.linear = torch.nn.Linear(32, 2) +class ModelParallelClassificationModel(ClassificationModel): + + def __init__(self, lr=0.01): + super().__init__() + + self.lr = lr + self.train_acc = Accuracy() + self.valid_acc = Accuracy() + self.test_acc = Accuracy() + + def on_model_parallel_setup(self) -> None: + for i in range(3): + setattr(self, f"layer_{i}", nn.Linear(32, 32)) + setattr(self, f"layer_{i}a", nn.ReLU()) + setattr(self, "layer_end", nn.Linear(32, 3)) + + + @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): """ @@ 
-388,6 +410,37 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer) +@pytest.mark.skipif("Currently failing") +@RunIf(min_gpus=2, deepspeed=True) +def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): + """ + Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation. + """ + model = ModelParallelClassificationModel() + dm = ClassifDataModule() + trainer = Trainer( + max_epochs=2, + plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)], + default_root_dir=tmpdir, + gpus=2, + fast_dev_run=True, + precision=16, + ) + trainer.fit(model, datamodule=dm) + + trainer = Trainer( + plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)], + default_root_dir=tmpdir, + gpus=2, + fast_dev_run=True, + precision=16, + resume_from_checkpoint=trainer.checkpoint_callback.best_model_path, + ) + trainer.fit(model, datamodule=dm) + + _assert_save_model_is_equal(model, tmpdir, trainer) + + @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): """ From d273393ece9c207b02f02c38e3eb8ddb71b32605 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 16 Mar 2021 10:39:51 +0000 Subject: [PATCH 11/60] change docstring --- tests/plugins/test_deepspeed_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 7a8e226e3ca56..47ff338063dea 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -414,7 +414,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): """ - Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation. 
+ Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint """ model = ModelParallelClassificationModel() dm = ClassifDataModule() From c91d12854eba5b23a27eb042b12240b1a24f3b66 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 16 Mar 2021 18:33:42 +0000 Subject: [PATCH 12/60] resolve accumulate_grad_batches --- .../plugins/training_type/deepspeed.py | 65 +++++++++++++++--- .../training_type/training_type_plugin.py | 3 + .../connectors/checkpoint_connector.py | 2 + pytorch_lightning/trainer/training_loop.py | 6 +- test.json | 1 + tests/helpers/simple_models.py | 7 +- tests/plugins/test_deepspeed_plugin.py | 66 ++++++++++++++++--- 7 files changed, 129 insertions(+), 21 deletions(-) create mode 100644 test.json diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index be6df07ba22b3..338dc7540e8fb 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -18,6 +18,7 @@ from pathlib import Path from types import SimpleNamespace from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from pytorch_lightning.callbacks import GradientAccumulationScheduler import torch @@ -237,6 +238,8 @@ def init_deepspeed(self): self._format_config() self._config_initialized = True + self._handle_gradient_accumulation_steps() + precision = self.lightning_module.trainer.accelerator.precision model = LightningDeepSpeedModule(pl_module=self.model, precision=precision) @@ -287,6 +290,7 @@ def _initialize_deepspeed_train(self, model): # set optimizer for save/load, but deepspeed manages the specific optimizer logic self.lightning_module.trainer.optimizers = [optimizer] + self.lightning_module.trainer.schedulers = [lr_scheduler] self.model = model def _call_model_parallel_setup(self): @@ -306,14 +310,29 @@ def _set_deepspeed_activation_checkpointing(self): ) def 
_initialize_deepspeed_inference(self, model): + optimizer, lightning_scheduler, optimizer_frequencies = None, None, None + if "optimizer" not in self.config: + rank_zero_info( + "You have not specified an optimizer or scheduler within the DeepSpeed config." + "Using `configure_optimizers` to define optimizer and scheduler." + ) + optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer() inference_config = { 'train_micro_batch_size_per_gpu': 1, 'fp16': self.config['fp16'], } + if self.zero_stage_3: + inference_config.update({ + "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'], + "zero_optimization": self.config['zero_optimization'], + }) model, _, _, _ = deepspeed.initialize( args=SimpleNamespace(local_rank=self.local_rank), model=model, + optimizer=optimizer, + lr_scheduler=lightning_scheduler, config_params=inference_config, + model_parameters=[], ) self.model = model @@ -344,6 +363,13 @@ def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Calla # internally, the engine has a reference to the optimizer already. 
self.model.step(**kwargs) + def _handle_gradient_accumulation_steps(self): + if self.config.get("gradient_accumulation_steps") > 1: + self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1}) + else: + self._original_accumulate_grad_batches = None + def _format_config(self): if self.config is None: raise MisconfigurationException( @@ -424,19 +450,24 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None: filepath: write-target file's path weights_only: saving model weights only """ - # dump states as a checkpoint dictionary object - client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) - save_dir = self._filepath_to_dir(filepath) - _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers'] - client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys} - self.model.save_checkpoint(save_dir, client_state=client_state) + if torch.distributed.get_world_size() > 1: + # dump states as a checkpoint dictionary object + client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) + save_dir = self._filepath_to_dir(filepath) + _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers'] + client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys} + self.model.save_checkpoint(save_dir, client_state=client_state) + else: + self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath) def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: - if torch.distributed.is_available(): + if torch.distributed.get_world_size() > 1: from pytorch_lightning.trainer.states import TrainerState load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING save_dir = self._filepath_to_dir(ckpt_path) - 
self.model.optimizer._partition_all_parameters() + + if self.zero_stage_3: + self.model.optimizer._partition_all_parameters() _, client_state = self.model.load_checkpoint( save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) @@ -447,4 +478,20 @@ def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda # hook: give user access to checkpoint if needed. self.lightning_module.on_load_checkpoint(client_state) - return client_state, False \ No newline at end of file + return client_state, False + else: + super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location) + return {}, False + + def _accumulated_batches_reached(self, trainer): + return (trainer.total_batch_idx) % trainer.accumulate_grad_batches == 0 + + def increment_accumulated_grad_global_step(self, trainer): + if self._original_accumulate_grad_batches is None: + trainer.global_step += 1 + else: + trainer.accumulate_grad_batches = self._original_accumulate_grad_batches + #print("increment_accumulated_grad_global_step", trainer.total_batch_idx, not self.should_accumulate(trainer), trainer.global_step, self.model.global_steps) + if self._accumulated_batches_reached(trainer): + trainer.global_step += 1 + trainer.accumulate_grad_batches = 1 \ No newline at end of file diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 9e2de5ce61ab0..9588a0fc1b145 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -181,3 +181,6 @@ def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda self.lightning_module.on_load_checkpoint(ckpt) self.lightning_module.load_state_dict(ckpt['state_dict']) return ckpt, True + + def increment_accumulated_grad_global_step(self, trainer) -> None: + trainer.global_step += 1 \ No newline at end 
of file diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 987d2dec65f13..23dcd01d63feb 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -98,6 +98,8 @@ def restore(self, checkpoint_path: str, on_gpu: bool) -> bool: if on_gpu: model.cuda(self.trainer.root_gpu) + print(checkpoint) + # restore training state self.restore_training_state(checkpoint, load_optimizer_states) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 88b87afcb9358..89e62cc51f543 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -547,6 +547,8 @@ def run_training_epoch(self): if self._num_training_batches_reached(is_last_batch): break + + # progress global step according to grads progress self.increment_accumulated_grad_global_step() @@ -632,6 +634,8 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): opt_idx=opt_idx, ) + self.trainer.training_type_plugin.on_batch + # ------------------------------ # BACKWARD PASS # ------------------------------ @@ -806,7 +810,7 @@ def increment_accumulated_grad_global_step(self): # progress global step according to grads progress if num_accumulated_batches_reached or num_training_batches_reached: - self.trainer.global_step += 1 + self.trainer.training_type_plugin.increment_accumulated_grad_global_step(self.trainer) def _accumulated_batches_reached(self): return (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0 diff --git a/test.json b/test.json new file mode 100644 index 0000000000000..d6a14fb12477c --- /dev/null +++ b/test.json @@ -0,0 +1 @@ +{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 3, 'cpu_offload': True, 'cpu_offload_params': False, 'cpu_offload_use_pin_memory': False, 'contiguous_gradients': True, 
'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000.0, 'reduce_bucket_size': 200000000.0}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'train_micro_batch_size_per_gpu': 10, 'gradient_accumulation_steps': 1, 'gradient_clipping': 0, 'fp16': {'enabled': True, 'loss_scale': 0, 'initial_scale_power': 32, 'loss_scale_window': 1000, 'hysteresis': 2, 'min_loss_scale': 1}} \ No newline at end of file diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py index a35ab2fc4a2fe..85f1968bee7eb 100644 --- a/tests/helpers/simple_models.py +++ b/tests/helpers/simple_models.py @@ -14,6 +14,7 @@ import torch import torch.nn.functional as F from torch import nn +from torch import distributed as dist from pytorch_lightning import LightningModule from pytorch_lightning.metrics import Accuracy, MeanSquaredError @@ -54,20 +55,20 @@ def training_step(self, batch, batch_idx): logits = self.forward(x) loss = F.cross_entropy(logits, y) self.log('train_loss', loss, prog_bar=True) - self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) + #self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) + # self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) def test_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) + # self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) class RegressionModel(LightningModule): diff --git 
a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 47ff338063dea..b91a178473e5a 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,14 +1,15 @@ import json import os +from pytorch_lightning.callbacks.base import Callback from pytorch_lightning.core import datamodule - +from typing import Any import pytest import torch from torch import Tensor from torch.optim import Optimizer from torch import nn from pytorch_lightning.metrics import Accuracy -from pytorch_lightning import Trainer +from pytorch_lightning import Trainer, callbacks from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -16,6 +17,7 @@ from tests.helpers.datamodules import ClassifDataModule from tests.helpers.simple_models import ClassificationModel from tests.helpers.runif import RunIf +from pytorch_lightning import LightningModule def test_deepspeed_lightning_module(tmpdir): @@ -389,6 +391,10 @@ def on_model_parallel_setup(self) -> None: setattr(self, f"layer_{i}a", nn.ReLU()) setattr(self, "layer_end", nn.Linear(32, 3)) + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + return [optimizer], [lr_scheduler] @RunIf(min_gpus=2, deepspeed=True) @@ -410,7 +416,6 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer) -@pytest.mark.skipif("Currently failing") @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): """ @@ -420,25 +425,70 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): dm = ClassifDataModule() trainer = Trainer( max_epochs=2, - 
plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)], + plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)], default_root_dir=tmpdir, gpus=2, - fast_dev_run=True, + limit_val_batches=2, + limit_test_batches=2, precision=16, + accumulate_grad_batches=2, ) trainer.fit(model, datamodule=dm) trainer = Trainer( - plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)], + max_epochs=3, + plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)], default_root_dir=tmpdir, gpus=2, - fast_dev_run=True, + limit_val_batches=2, + limit_test_batches=2, precision=16, resume_from_checkpoint=trainer.checkpoint_callback.best_model_path, ) trainer.fit(model, datamodule=dm) + trainer.test(datamodule=dm) - _assert_save_model_is_equal(model, tmpdir, trainer) + +@RunIf(min_gpus=2, deepspeed=True) +def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir, deepspeed_config): + """ + Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint + """ + class VerificationCallback(Callback): + + def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + deepspeed_engine = trainer.training_type_plugin.model + assert trainer.global_step == deepspeed_engine.global_steps + + + model = ModelParallelClassificationModel() + dm = ClassifDataModule() + trainer = Trainer( + max_epochs=2, + plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)], + default_root_dir=tmpdir, + gpus=2, + limit_val_batches=2, + limit_test_batches=2, + precision=16, + accumulate_grad_batches=3, + callbacks=[VerificationCallback()] + ) + trainer.fit(model, datamodule=dm) + + trainer = Trainer( + max_epochs=3, + plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)], + default_root_dir=tmpdir, + gpus=2, + limit_val_batches=2, + limit_test_batches=2, + precision=16, + 
resume_from_checkpoint=trainer.checkpoint_callback.best_model_path, + callbacks=[VerificationCallback()] + ) + trainer.fit(model, datamodule=dm) + trainer.test(datamodule=dm) @RunIf(min_gpus=2, deepspeed=True) From 959d7b7c5e15f4e7ec1fa7913c98c74975e83339 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 16 Mar 2021 18:40:31 +0000 Subject: [PATCH 13/60] resolve flake8 --- .../plugins/training_type/deepspeed.py | 20 ++++++++++-------- .../training_type/training_type_plugin.py | 10 +++++---- pytorch_lightning/trainer/trainer.py | 9 ++++---- pytorch_lightning/trainer/training_loop.py | 2 -- test.json | 1 - tests/helpers/simple_models.py | 9 ++++---- tests/plugins/test_deepspeed_plugin.py | 21 ++++++++++--------- 7 files changed, 36 insertions(+), 36 deletions(-) delete mode 100644 test.json diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 338dc7540e8fb..fc99866cefe49 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -18,10 +18,10 @@ from pathlib import Path from types import SimpleNamespace from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from pytorch_lightning.callbacks import GradientAccumulationScheduler import torch +from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -455,23 +455,26 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None: client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) save_dir = self._filepath_to_dir(filepath) _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers'] - client_state = {k:v for k, v in client_state.items() if k not in _exclude_keys} + 
client_state = {k: v for k, v in client_state.items() if k not in _exclude_keys} self.model.save_checkpoint(save_dir, client_state=client_state) else: self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath) - def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: + def restore_model_state_from_ckpt_path(self, + ckpt_path: str, + map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: if torch.distributed.get_world_size() > 1: from pytorch_lightning.trainer.states import TrainerState load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING save_dir = self._filepath_to_dir(ckpt_path) - + if self.zero_stage_3: - self.model.optimizer._partition_all_parameters() + self.model.optimizer._partition_all_parameters() _, client_state = self.model.load_checkpoint( - save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states) - + save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states + ) + # restore datamodule states if self.lightning_module.trainer.datamodule is not None: self.lightning_module.trainer.datamodule.on_load_checkpoint(client_state) @@ -491,7 +494,6 @@ def increment_accumulated_grad_global_step(self, trainer): trainer.global_step += 1 else: trainer.accumulate_grad_batches = self._original_accumulate_grad_batches - #print("increment_accumulated_grad_global_step", trainer.total_batch_idx, not self.should_accumulate(trainer), trainer.global_step, self.model.global_steps) if self._accumulated_batches_reached(trainer): trainer.global_step += 1 - trainer.accumulate_grad_batches = 1 \ No newline at end of file + trainer.accumulate_grad_batches = 1 diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 9588a0fc1b145..49c337e936b8a 100644 --- 
a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union, Tuple -from pytorch_lightning.utilities.cloud_io import load as pl_load +from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union import torch from torch.nn import Module @@ -23,6 +22,7 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.base_plugin import Plugin +from pytorch_lightning.utilities.cloud_io import load as pl_load if TYPE_CHECKING: from pytorch_lightning.trainer.trainer import Trainer @@ -171,7 +171,9 @@ def init_optimizers(self, trainer: "Trainer", model: LightningModule): def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs): optimizer.step(closure=lambda_closure, **kwargs) - def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: + def restore_model_state_from_ckpt_path(self, + ckpt_path: str, + map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage) # restore datamodule states if self.lightning_module.trainer.datamodule is not None: @@ -183,4 +185,4 @@ def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda return ckpt, True def increment_accumulated_grad_global_step(self, trainer) -> None: - trainer.global_step += 1 \ No newline at end of file + trainer.global_step += 1 diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8ebab2e110aca..acdf226cfa89b 100644 --- 
a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -58,7 +58,6 @@ from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities import rank_zero_warn -from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import recursive_detach @@ -922,9 +921,7 @@ def test( # If you supply a datamodule you can't supply test_dataloaders if test_dataloaders and datamodule: - raise MisconfigurationException( - 'You cannot pass both `trainer.test(test_dataloaders=..., datamodule=...)`' - ) + raise MisconfigurationException('You cannot pass both `trainer.test(test_dataloaders=..., datamodule=...)`') model_provided = model is not None model = model or self.lightning_module @@ -971,7 +968,9 @@ def __load_ckpt_weights( self.training_type_plugin.barrier() - self.training_type_plugin.restore_model_state_from_ckpt_path(ckpt_path, map_location=lambda storage, loc: storage) + self.training_type_plugin.restore_model_state_from_ckpt_path( + ckpt_path, map_location=lambda storage, loc: storage + ) return ckpt_path def predict( diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 89e62cc51f543..66a44704ad19a 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -547,8 +547,6 @@ def run_training_epoch(self): if self._num_training_batches_reached(is_last_batch): break - - # progress global step according to grads progress self.increment_accumulated_grad_global_step() diff --git a/test.json b/test.json deleted file mode 100644 index d6a14fb12477c..0000000000000 --- a/test.json +++ /dev/null @@ -1 +0,0 @@ -{'zero_allow_untested_optimizer': True, 'zero_optimization': 
{'stage': 3, 'cpu_offload': True, 'cpu_offload_params': False, 'cpu_offload_use_pin_memory': False, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000.0, 'reduce_bucket_size': 200000000.0}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'train_micro_batch_size_per_gpu': 10, 'gradient_accumulation_steps': 1, 'gradient_clipping': 0, 'fp16': {'enabled': True, 'loss_scale': 0, 'initial_scale_power': 32, 'loss_scale_window': 1000, 'hysteresis': 2, 'min_loss_scale': 1}} \ No newline at end of file diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py index 85f1968bee7eb..a1b6c03fd00d8 100644 --- a/tests/helpers/simple_models.py +++ b/tests/helpers/simple_models.py @@ -14,7 +14,6 @@ import torch import torch.nn.functional as F from torch import nn -from torch import distributed as dist from pytorch_lightning import LightningModule from pytorch_lightning.metrics import Accuracy, MeanSquaredError @@ -28,7 +27,7 @@ def __init__(self, lr=0.01): self.lr = lr for i in range(3): setattr(self, f"layer_{i}", nn.Linear(32, 32)) - setattr(self, f"layer_{i}a", torch.nn.ReLU()) + setattr(self, f"layer_{i}a", nn.ReLU()) setattr(self, "layer_end", nn.Linear(32, 3)) self.train_acc = Accuracy() @@ -55,20 +54,20 @@ def training_step(self, batch, batch_idx): logits = self.forward(x) loss = F.cross_entropy(logits, y) self.log('train_loss', loss, prog_bar=True) - #self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) + self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) - # self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) + 
self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) def test_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) - # self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) + self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) class RegressionModel(LightningModule): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index b91a178473e5a..125c33ab47f4d 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,23 +1,22 @@ import json import os -from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.core import datamodule from typing import Any + import pytest import torch -from torch import Tensor +from torch import nn, Tensor from torch.optim import Optimizer -from torch import nn -from pytorch_lightning.metrics import Accuracy -from pytorch_lightning import Trainer, callbacks + +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.metrics import Accuracy from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule -from tests.helpers.simple_models import ClassificationModel from tests.helpers.runif import RunIf -from pytorch_lightning import LightningModule +from tests.helpers.simple_models import ClassificationModel def test_deepspeed_lightning_module(tmpdir): @@ -454,13 +453,15 @@ def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir, """ Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from 
a checkpoint """ + class VerificationCallback(Callback): - def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + def on_train_batch_start( + self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int + ) -> None: deepspeed_engine = trainer.training_type_plugin.model assert trainer.global_step == deepspeed_engine.global_steps - model = ModelParallelClassificationModel() dm = ClassifDataModule() trainer = Trainer( From f0cb6e744b1c6e0c40fb615620a2020a72614394 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 17 Mar 2021 10:13:27 +0000 Subject: [PATCH 14/60] Update DeepSpeed to use latest version, add some comments --- dockers/base-cuda/Dockerfile | 4 +--- .../plugins/training_type/deepspeed.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 843e47ca91289..d7c13e7560010 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -114,9 +114,7 @@ RUN \ rm -rf apex RUN \ - # install DeepSpeed from source. 
- # todo: swap to pypi release once DeepSpeed releases a new version >= 0.3.10 - pip install deepspeed@git+https://github.com/microsoft/DeepSpeed@ec8b1cb + pip install deepspeed==0.3.13 RUN \ # Show what we have diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index fc99866cefe49..d7b4704450e5c 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -279,7 +279,6 @@ def _initialize_deepspeed_train(self, model): optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer() model_parameters = filter(lambda p: p.requires_grad, self.model.parameters()) model, optimizer, _, lr_scheduler = deepspeed.initialize( - args=SimpleNamespace(local_rank=self.local_rank), model=model, model_parameters=model_parameters, optimizer=optimizer, @@ -310,6 +309,7 @@ def _set_deepspeed_activation_checkpointing(self): ) def _initialize_deepspeed_inference(self, model): + # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly optimizer, lightning_scheduler, optimizer_frequencies = None, None, None if "optimizer" not in self.config: rank_zero_info( @@ -318,6 +318,7 @@ def _initialize_deepspeed_inference(self, model): ) optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer() inference_config = { + # todo: this is required for DeepSpeed throughput timers 'train_micro_batch_size_per_gpu': 1, 'fp16': self.config['fp16'], } @@ -441,7 +442,11 @@ def _create_default_config( return cfg def _filepath_to_dir(self, filepath: str): - return filepath.split('.')[0] + return os.path.dirname(filepath) + + @property + def deepspeed_engine(self): + return self.model def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None: """Save model/training states as a checkpoint file through state-dump and file-write. 
@@ -451,12 +456,13 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None: weights_only: saving model weights only """ if torch.distributed.get_world_size() > 1: + # Use deepspeed's internal checkpointing function to handle partitioned weights across processes # dump states as a checkpoint dictionary object client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) save_dir = self._filepath_to_dir(filepath) _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers'] client_state = {k: v for k, v in client_state.items() if k not in _exclude_keys} - self.model.save_checkpoint(save_dir, client_state=client_state) + self.deepspeed_engine.save_checkpoint(save_dir, client_state=client_state) else: self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath) @@ -469,7 +475,8 @@ def restore_model_state_from_ckpt_path(self, save_dir = self._filepath_to_dir(ckpt_path) if self.zero_stage_3: - self.model.optimizer._partition_all_parameters() + # TODO: Currently required as this call is missing within the deepspeed engine. 
+ self.deepspeed_engine.optimizer._partition_all_parameters() _, client_state = self.model.load_checkpoint( save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states @@ -487,7 +494,7 @@ def restore_model_state_from_ckpt_path(self, return {}, False def _accumulated_batches_reached(self, trainer): - return (trainer.total_batch_idx) % trainer.accumulate_grad_batches == 0 + return trainer.total_batch_idx % trainer.accumulate_grad_batches == 0 def increment_accumulated_grad_global_step(self, trainer): if self._original_accumulate_grad_batches is None: From 914de8670062cb362384c77d0db41262b6fcc0fc Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 17 Mar 2021 10:56:40 +0000 Subject: [PATCH 15/60] add metrics --- .../plugins/training_type/deepspeed.py | 7 +- .../connectors/checkpoint_connector.py | 2 +- tests/helpers/pipelines.py | 2 +- tests/helpers/simple_models.py | 6 +- tests/plugins/test_deepspeed_plugin.py | 92 +++++++++++++------ 5 files changed, 74 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 338dc7540e8fb..609e3f3e570aa 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -297,6 +297,8 @@ def _call_model_parallel_setup(self): if self.zero_stage_3: with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): self.lightning_module.trainer.call_hook("on_model_parallel_setup") + else: + self.lightning_module.trainer.call_hook("on_model_parallel_setup") def _set_deepspeed_activation_checkpointing(self): if self.config.get('activation_checkpointing'): @@ -319,8 +321,11 @@ def _initialize_deepspeed_inference(self, model): optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer() inference_config = { 'train_micro_batch_size_per_gpu': 1, - 'fp16': self.config['fp16'], } + if 'fp16' in self.config: + 
inference_config.update({ + "fp16": self.config["fp16"] + }) if self.zero_stage_3: inference_config.update({ "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'], diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 23dcd01d63feb..c68951f9a66c6 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -121,7 +121,7 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None: # restore model state_dict model.load_state_dict(checkpoint['state_dict']) - def restore_training_state(self, checkpoint, load_optimizer_states: bool): + def restore_training_state(self, checkpoint, load_optimizer_states: bool = True): """ Restore trainer state. Model will get its change to update diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py index 403bcdfee8c1d..ebacad05b0a6f 100644 --- a/tests/helpers/pipelines.py +++ b/tests/helpers/pipelines.py @@ -100,7 +100,7 @@ def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50): # run prediction on 1 batch trained_model.cpu() trained_model.eval() - + batch = next(iter(dataloader)) x, y = batch x = x.flatten(1) diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py index 85f1968bee7eb..f71f3df16df33 100644 --- a/tests/helpers/simple_models.py +++ b/tests/helpers/simple_models.py @@ -55,20 +55,20 @@ def training_step(self, batch, batch_idx): logits = self.forward(x) loss = F.cross_entropy(logits, y) self.log('train_loss', loss, prog_bar=True) - #self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) + self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) - # 
self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) + self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) def test_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) - # self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) + self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) class RegressionModel(LightningModule): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index b91a178473e5a..fef6b3af9f8dd 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,21 +1,20 @@ import json import os -from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.core import datamodule +from pytorch_lightning.callbacks import Callback, ModelCheckpoint from typing import Any import pytest import torch from torch import Tensor from torch.optim import Optimizer from torch import nn +import torch.nn.functional as F from pytorch_lightning.metrics import Accuracy -from pytorch_lightning import Trainer, callbacks +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.datamodules import ClassifDataModule -from tests.helpers.simple_models import ClassificationModel from tests.helpers.runif import RunIf from pytorch_lightning import LightningModule @@ -375,22 +374,56 @@ def on_model_parallel_setup(self) -> None: self.linear = torch.nn.Linear(32, 2) -class ModelParallelClassificationModel(ClassificationModel): +class ModelParallelClassificationModel(LightningModule): def __init__(self, lr=0.01): super().__init__() 
- self.lr = lr + self.train_acc = Accuracy() self.valid_acc = Accuracy() self.test_acc = Accuracy() + def make_block(self): + return nn.Sequential(nn.Linear(32, 32, bias=False), nn.BatchNorm1d(32), nn.LeakyReLU()) + def on_model_parallel_setup(self) -> None: for i in range(3): - setattr(self, f"layer_{i}", nn.Linear(32, 32)) - setattr(self, f"layer_{i}a", nn.ReLU()) + setattr(self, f"block_{i}", self.make_block()) setattr(self, "layer_end", nn.Linear(32, 3)) + def forward(self, x): + x = self.block_0(x) + x = self.block_1(x) + x = self.block_2(x) + x = self.layer_end(x) + logits = F.softmax(x, dim=1) + return logits + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) + return [optimizer], [] + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self.forward(x) + loss = F.cross_entropy(logits, y) + self.log('train_loss', loss, prog_bar=True) + self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self.forward(x) + self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) + self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self.forward(x) + self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) + self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) + def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) @@ -421,32 +454,33 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): """ Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint """ - model = ModelParallelClassificationModel() + seed_everything(42) + model = ModelParallelClassificationModel(lr=0.1) dm = ClassifDataModule() 
+ ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1) trainer = Trainer( - max_epochs=2, - plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)], + max_epochs=10, + plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)], default_root_dir=tmpdir, gpus=2, - limit_val_batches=2, - limit_test_batches=2, precision=16, accumulate_grad_batches=2, + callbacks=[ck] ) trainer.fit(model, datamodule=dm) - + results = trainer.test(model, datamodule=dm) + print(results) + results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm) + print(results) trainer = Trainer( - max_epochs=3, + max_epochs=1, plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)], default_root_dir=tmpdir, gpus=2, - limit_val_batches=2, - limit_test_batches=2, precision=16, - resume_from_checkpoint=trainer.checkpoint_callback.best_model_path, + resume_from_checkpoint=ck.best_model_path, ) trainer.fit(model, datamodule=dm) - trainer.test(datamodule=dm) @RunIf(min_gpus=2, deepspeed=True) @@ -454,27 +488,26 @@ def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir, """ Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint """ + seed_everything(42) class VerificationCallback(Callback): def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int) -> None: deepspeed_engine = trainer.training_type_plugin.model assert trainer.global_step == deepspeed_engine.global_steps - - model = ModelParallelClassificationModel() + model = ModelParallelClassificationModel(lr=0.1) dm = ClassifDataModule() trainer = Trainer( - max_epochs=2, - plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)], - default_root_dir=tmpdir, + max_epochs=5, + plugins=[DeepSpeedPlugin(stage=2, zero_optimization=False, cpu_offload=True)], gpus=2, limit_val_batches=2, - limit_test_batches=2, - precision=16, - 
accumulate_grad_batches=3, + precision=32, + accumulate_grad_batches=2, callbacks=[VerificationCallback()] ) trainer.fit(model, datamodule=dm) + results = trainer.test(datamodule=dm) trainer = Trainer( max_epochs=3, @@ -487,8 +520,9 @@ def on_train_batch_start(self, trainer, pl_module: LightningModule, batch: Any, resume_from_checkpoint=trainer.checkpoint_callback.best_model_path, callbacks=[VerificationCallback()] ) - trainer.fit(model, datamodule=dm) - trainer.test(datamodule=dm) + results = trainer.test(model, datamodule=dm) + # todo (tchaton) resolve different metrics + print(results) @RunIf(min_gpus=2, deepspeed=True) From 712814c5e4b925395be46a99ab9b3498e565e8da Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 17 Mar 2021 11:00:03 +0000 Subject: [PATCH 16/60] update --- tests/plugins/test_deepspeed_plugin.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index b89c2262e111f..5895c0761edf8 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,16 +1,16 @@ import json import os -from pytorch_lightning.callbacks import Callback, ModelCheckpoint from typing import Any import pytest import torch +import torch.nn.functional as F from torch import nn, Tensor from torch.optim import Optimizer -from torch import nn -import torch.nn.functional as F -from pytorch_lightning.metrics import Accuracy -from pytorch_lightning import Trainer, seed_everything + +from pytorch_lightning import LightningModule, seed_everything, Trainer +from pytorch_lightning.callbacks import Callback, ModelCheckpoint +from pytorch_lightning.metrics import Accuracy from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -400,10 +400,6 @@ def 
forward(self, x): logits = F.softmax(x, dim=1) return logits - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) - return [optimizer], [] - def training_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) @@ -489,11 +485,13 @@ def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir, Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint """ seed_everything(42) + class VerificationCallback(Callback): def on_train_batch_start( self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int ) -> None: + deepspeed_engine = trainer.training_type_plugin.model assert trainer.global_step == deepspeed_engine.global_steps From a1644c63885beb1e55adc484154360c84167a0e1 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 17 Mar 2021 11:15:13 +0000 Subject: [PATCH 17/60] Small formatting fixes, clean up some code --- .../connectors/checkpoint_connector.py | 5 +- tests/helpers/pipelines.py | 2 +- tests/helpers/simple_models.py | 8 ++-- tests/plugins/test_deepspeed_plugin.py | 47 +++++++++---------- 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index c68951f9a66c6..09f9d1515f58f 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -91,15 +91,14 @@ def restore(self, checkpoint_path: str, on_gpu: bool) -> bool: return False checkpoint, load_optimizer_states = self.trainer.training_type_plugin.restore_model_state_from_ckpt_path( - checkpoint_path, map_location=lambda storage, loc: storage) + checkpoint_path, map_location=lambda storage, loc: storage + ) model = self.trainer.lightning_module if on_gpu: model.cuda(self.trainer.root_gpu) - print(checkpoint) - # restore training state 
self.restore_training_state(checkpoint, load_optimizer_states) diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py index ebacad05b0a6f..403bcdfee8c1d 100644 --- a/tests/helpers/pipelines.py +++ b/tests/helpers/pipelines.py @@ -100,7 +100,7 @@ def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50): # run prediction on 1 batch trained_model.cpu() trained_model.eval() - + batch = next(iter(dataloader)) x, y = batch x = x.flatten(1) diff --git a/tests/helpers/simple_models.py b/tests/helpers/simple_models.py index a1b6c03fd00d8..1abeb1f00206a 100644 --- a/tests/helpers/simple_models.py +++ b/tests/helpers/simple_models.py @@ -27,7 +27,7 @@ def __init__(self, lr=0.01): self.lr = lr for i in range(3): setattr(self, f"layer_{i}", nn.Linear(32, 32)) - setattr(self, f"layer_{i}a", nn.ReLU()) + setattr(self, f"layer_{i}a", torch.nn.ReLU()) setattr(self, "layer_end", nn.Linear(32, 3)) self.train_acc = Accuracy() @@ -54,20 +54,20 @@ def training_step(self, batch, batch_idx): logits = self.forward(x) loss = F.cross_entropy(logits, y) self.log('train_loss', loss, prog_bar=True) - self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) + self.log('train_acc', self.train_acc(logits, y), prog_bar=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) + self.log('val_acc', self.valid_acc(logits, y), prog_bar=True) def test_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) + self.log('test_acc', self.test_acc(logits, y), prog_bar=True) class RegressionModel(LightningModule): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 
5895c0761edf8..cfb212651101a 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -21,7 +21,7 @@ def test_deepspeed_lightning_module(tmpdir): """ - Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly. + Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly. """ model = BoringModel() @@ -39,7 +39,7 @@ def test_deepspeed_lightning_module(tmpdir): @RunIf(min_gpus=1) def test_deepspeed_lightning_module_precision(tmpdir): """ - Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves tensors to half when precision 16. + Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves tensors to half when precision 16. """ model = BoringModel() @@ -89,7 +89,7 @@ def deepspeed_zero_config(deepspeed_config): @pytest.mark.parametrize("input", ("deepspeed", DeepSpeedPlugin)) def test_deepspeed_plugin_string(tmpdir, input): """ - Test to ensure that the plugin can be passed via string or instance, and parallel devices is correctly set. + Test to ensure that the plugin can be passed via string or instance, and parallel devices is correctly set. """ trainer = Trainer( @@ -133,8 +133,8 @@ def test_deepspeed_plugin_env(tmpdir, monkeypatch, deepspeed_config): ) def test_deepspeed_precision_choice(amp_backend, tmpdir): """ - Test to ensure precision plugin is also correctly chosen. - DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin + Test to ensure precision plugin is also correctly chosen. + DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin """ trainer = Trainer( @@ -165,7 +165,7 @@ def test_deepspeed_with_invalid_config_path(tmpdir): @RunIf(deepspeed=True) def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config): """ - Test to ensure if we pass an env variable, we load the config from the path. + Test to ensure if we pass an env variable, we load the config from the path. 
""" config_path = os.path.join(tmpdir, 'temp.json') with open(config_path, 'w') as f: @@ -223,8 +223,10 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args @RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_run_configure_optimizers(tmpdir): - """Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), - whilst using configure_optimizers for optimizers and schedulers.""" + """ + Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation), + whilst using configure_optimizers for optimizers and schedulers. + """ class TestModel(BoringModel): @@ -254,8 +256,8 @@ def on_train_start(self) -> None: @RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_config(tmpdir, deepspeed_zero_config): """ - Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers - and saves the model weights to load correctly. + Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers + and saves the model weights to load correctly. """ class TestModel(BoringModel): @@ -348,7 +350,7 @@ def on_train_start(self) -> None: @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu(tmpdir, deepspeed_config): """ - Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation. + Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation. 
""" model = BoringModel() trainer = Trainer( @@ -376,9 +378,10 @@ def on_model_parallel_setup(self) -> None: class ModelParallelClassificationModel(LightningModule): - def __init__(self, lr=0.01): + def __init__(self, lr: float = 0.01, num_blocks: int = 3): super().__init__() self.lr = lr + self.num_blocks = num_blocks self.train_acc = Accuracy() self.valid_acc = Accuracy() @@ -388,15 +391,10 @@ def make_block(self): return nn.Sequential(nn.Linear(32, 32, bias=False), nn.BatchNorm1d(32), nn.LeakyReLU()) def on_model_parallel_setup(self) -> None: - for i in range(3): - setattr(self, f"block_{i}", self.make_block()) - setattr(self, "layer_end", nn.Linear(32, 3)) + self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3)) def forward(self, x): - x = self.block_0(x) - x = self.block_1(x) - x = self.block_2(x) - x = self.layer_end(x) + x = self.model(x) logits = F.softmax(x, dim=1) return logits @@ -429,7 +427,7 @@ def configure_optimizers(self): @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): """ - Test to ensure that DeepSpeed with multiple GPUs works, without ZeRO Optimization as this requires compilation. + Test to ensure ZeRO Stage 3 works with a parallel model. 
""" model = ModelParallelBoringModel() trainer = Trainer( @@ -448,7 +446,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): """ - Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint + Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint """ seed_everything(42) model = ModelParallelClassificationModel(lr=0.1) @@ -480,9 +478,9 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): @RunIf(min_gpus=2, deepspeed=True) -def test_deepspeed_multigpu_stage_2_checkpointing_accumated_grad_batches(tmpdir, deepspeed_config): +def test_deepspeed_multigpu_stage_2_checkpointing_accumulated_grad_batches(tmpdir, deepspeed_config): """ - Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint + Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. """ seed_everything(42) @@ -491,7 +489,6 @@ class VerificationCallback(Callback): def on_train_batch_start( self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int ) -> None: - deepspeed_engine = trainer.training_type_plugin.model assert trainer.global_step == deepspeed_engine.global_steps @@ -528,7 +525,7 @@ def on_train_batch_start( @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): """ - Test to ensure we can use DeepSpeed with just test. + Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3. 
""" model = ModelParallelBoringModel() trainer = Trainer( From 64f624f36161208bcb0157a271d7309529620686 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Wed, 17 Mar 2021 22:05:37 +0000 Subject: [PATCH 18/60] Few cleanups --- pytorch_lightning/trainer/training_loop.py | 2 -- tests/plugins/test_deepspeed_plugin.py | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 66a44704ad19a..5420bb038caca 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -632,8 +632,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): opt_idx=opt_idx, ) - self.trainer.training_type_plugin.on_batch - # ------------------------------ # BACKWARD PASS # ------------------------------ diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index cfb212651101a..42158e46a8641 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -386,9 +386,10 @@ def __init__(self, lr: float = 0.01, num_blocks: int = 3): self.train_acc = Accuracy() self.valid_acc = Accuracy() self.test_acc = Accuracy() + self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3)) def make_block(self): - return nn.Sequential(nn.Linear(32, 32, bias=False), nn.BatchNorm1d(32), nn.LeakyReLU()) + return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU()) def on_model_parallel_setup(self) -> None: self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3)) @@ -538,12 +539,12 @@ def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): trainer.test(model) -def _assert_save_model_is_equal(model, tmpdir, trainer): +def _assert_save_model_is_equal(model, tmpdir, trainer, cls=BoringModel): checkpoint_path = os.path.join(tmpdir, 'model.pt') trainer.save_checkpoint(checkpoint_path) # carry out the 
check only on rank 0 if trainer.global_rank == 0: - saved_model = BoringModel.load_from_checkpoint(checkpoint_path) + saved_model = cls.load_from_checkpoint(checkpoint_path) if model.dtype == torch.half: saved_model = saved_model.half() # model is loaded in float32 as default, move it to float16 model = model.cpu() From 89fbbcbb9fd763661f61683c005f63d68b3aad38 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 18 Mar 2021 10:48:33 +0000 Subject: [PATCH 19/60] No need for default state --- pytorch_lightning/plugins/training_type/deepspeed.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 2e255b4255875..ba4a1838b4c28 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -324,9 +324,7 @@ def _initialize_deepspeed_inference(self, model): 'train_micro_batch_size_per_gpu': 1, } if 'fp16' in self.config: - inference_config.update({ - "fp16": self.config["fp16"] - }) + inference_config.update({"fp16": self.config["fp16"]}) if self.zero_stage_3: inference_config.update({ "zero_allow_untested_optimizer": self.config['zero_allow_untested_optimizer'], @@ -494,9 +492,7 @@ def restore_model_state_from_ckpt_path(self, # hook: give user access to checkpoint if needed. 
self.lightning_module.on_load_checkpoint(client_state) return client_state, False - else: - super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location) - return {}, False + return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location) def _accumulated_batches_reached(self, trainer): return trainer.total_batch_idx % trainer.accumulate_grad_batches == 0 From 701d41758fec1e1178588da24720f0b0b3f8a922 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 18 Mar 2021 12:40:07 +0000 Subject: [PATCH 20/60] Fix tests, add some boilerplate that should move eventually --- .../plugins/training_type/deepspeed.py | 11 ++- tests/plugins/test_deepspeed_plugin.py | 73 ++++++++----------- 2 files changed, 36 insertions(+), 48 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index ba4a1838b4c28..0820f64aa3323 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -203,6 +203,7 @@ def __init__( self.loss_scale_window = loss_scale_window self.hysteresis = hysteresis self.min_loss_scale = min_loss_scale + self.on_model_parallel_setup_called = False def _load_config(self, config): if config is None and self.DEEPSPEED_ENV_VAR in os.environ: @@ -293,11 +294,13 @@ def _initialize_deepspeed_train(self, model): self.model = model def _call_model_parallel_setup(self): - if self.zero_stage_3: - with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): + if not self.on_model_parallel_setup_called: + if self.zero_stage_3: + with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): + self.lightning_module.trainer.call_hook("on_model_parallel_setup") + else: self.lightning_module.trainer.call_hook("on_model_parallel_setup") - else: - self.lightning_module.trainer.call_hook("on_model_parallel_setup") + self.on_model_parallel_setup_called = True def 
_set_deepspeed_activation_checkpointing(self): if self.config.get('activation_checkpointing'): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 42158e46a8641..e740edc353165 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -378,7 +378,7 @@ def on_model_parallel_setup(self) -> None: class ModelParallelClassificationModel(LightningModule): - def __init__(self, lr: float = 0.01, num_blocks: int = 3): + def __init__(self, lr: float = 0.01, num_blocks: int = 5): super().__init__() self.lr = lr self.num_blocks = num_blocks @@ -386,7 +386,6 @@ def __init__(self, lr: float = 0.01, num_blocks: int = 3): self.train_acc = Accuracy() self.valid_acc = Accuracy() self.test_acc = Accuracy() - self.model = nn.Sequential(*(self.make_block() for x in range(self.num_blocks)), nn.Linear(32, 3)) def make_block(self): return nn.Sequential(nn.Linear(32, 32, bias=False), nn.ReLU()) @@ -396,6 +395,8 @@ def on_model_parallel_setup(self) -> None: def forward(self, x): x = self.model(x) + # Ensure output is in float32 for softmax operation + x = x.float() logits = F.softmax(x, dim=1) return logits @@ -404,25 +405,29 @@ def training_step(self, batch, batch_idx): logits = self.forward(x) loss = F.cross_entropy(logits, y) self.log('train_loss', loss, prog_bar=True) - self.log('train_acc', self.train_acc(logits.argmax(-1), y), prog_bar=True) + self.log('train_acc', self.train_acc(logits, y), prog_bar=True, sync_dist=True) return {"loss": loss} def validation_step(self, batch, batch_idx): x, y = batch logits = self.forward(x) - self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('val_acc', self.valid_acc(logits.argmax(-1), y), prog_bar=True) + self.log('val_loss', F.cross_entropy(logits, y), prog_bar=False, sync_dist=True) + self.log('val_acc', self.valid_acc(logits, y), prog_bar=True, sync_dist=True) def test_step(self, batch, batch_idx): x, y = batch logits = 
self.forward(x) - self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False) - self.log('test_acc', self.test_acc(logits.argmax(-1), y), prog_bar=True) + self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False, sync_dist=True) + self.log('test_acc', self.test_acc(logits, y), prog_bar=True, sync_dist=True) def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) - return [optimizer], [lr_scheduler] + + lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99) + return [optimizer], [{ + 'scheduler': lr_scheduler, + 'interval': 'step', + }] @RunIf(min_gpus=2, deepspeed=True) @@ -447,15 +452,16 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): @RunIf(min_gpus=2, deepspeed=True) def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): """ - Test to ensure with Stage 3 and multiple GPUs that we can save/load a model, resuming from a checkpoint + Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, + and see convergence. 
""" seed_everything(42) - model = ModelParallelClassificationModel(lr=0.1) + model = ModelParallelClassificationModel() dm = ClassifDataModule() ck = ModelCheckpoint(monitor="val_acc", mode="max", save_last=True, save_top_k=-1) trainer = Trainer( max_epochs=10, - plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True)], + plugins=[DeepSpeedPlugin(stage=3)], default_root_dir=tmpdir, gpus=2, precision=16, @@ -463,23 +469,18 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): callbacks=[ck] ) trainer.fit(model, datamodule=dm) + results = trainer.test(model, datamodule=dm) - print(results) - results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm) - print(results) - trainer = Trainer( - max_epochs=1, - plugins=[DeepSpeedPlugin(stage=3, zero_optimization=True, cpu_offload=True)], - default_root_dir=tmpdir, - gpus=2, - precision=16, - resume_from_checkpoint=ck.best_model_path, - ) - trainer.fit(model, datamodule=dm) + assert results[0]['test_acc'] > 0.7 + + saved_results = trainer.test(ckpt_path=ck.best_model_path, datamodule=dm) + assert saved_results[0]['test_acc'] > 0.7 + assert saved_results == results @RunIf(min_gpus=2, deepspeed=True) -def test_deepspeed_multigpu_stage_2_checkpointing_accumulated_grad_batches(tmpdir, deepspeed_config): +@pytest.mark.parametrize('cpu_offload', [True, False]) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, cpu_offload): """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. 
""" @@ -493,34 +494,18 @@ def on_train_batch_start( deepspeed_engine = trainer.training_type_plugin.model assert trainer.global_step == deepspeed_engine.global_steps - model = ModelParallelClassificationModel(lr=0.1) + model = ModelParallelClassificationModel() dm = ClassifDataModule() trainer = Trainer( max_epochs=5, - plugins=[DeepSpeedPlugin(stage=2, zero_optimization=False, cpu_offload=True)], + plugins=[DeepSpeedPlugin(stage=2, cpu_offload=cpu_offload)], gpus=2, limit_val_batches=2, - precision=32, + precision=16, accumulate_grad_batches=2, callbacks=[VerificationCallback()] ) trainer.fit(model, datamodule=dm) - results = trainer.test(datamodule=dm) - - trainer = Trainer( - max_epochs=3, - plugins=[DeepSpeedPlugin(stage=2, zero_optimization=True, cpu_offload=True)], - default_root_dir=tmpdir, - gpus=2, - limit_val_batches=2, - limit_test_batches=2, - precision=16, - resume_from_checkpoint=trainer.checkpoint_callback.best_model_path, - callbacks=[VerificationCallback()] - ) - results = trainer.test(model, datamodule=dm) - # todo (tchaton) resolve different metrics - print(results) @RunIf(min_gpus=2, deepspeed=True) From 270d6ed04843e21537b410f43f66743c73311158 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 22 Mar 2021 17:41:39 +0000 Subject: [PATCH 21/60] Add hook removal --- pytorch_lightning/plugins/training_type/deepspeed.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 0820f64aa3323..3db03c5dd8761 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -35,6 +35,7 @@ if _DEEPSPEED_AVAILABLE: import deepspeed + from deepspeed.runtime.zero.stage3 import remove_module_hooks class LightningDeepSpeedModule(_LightningModuleWrapperBase): @@ -333,6 +334,8 @@ def _initialize_deepspeed_inference(self, model): "zero_allow_untested_optimizer": 
self.config['zero_allow_untested_optimizer'], "zero_optimization": self.config['zero_optimization'], }) + # Remove all module hooks before initializing new model + remove_module_hooks(model) model, _, _, _ = deepspeed.initialize( args=SimpleNamespace(local_rank=self.local_rank), model=model, From a236ff050cde4ff8c6f98a074e1ba5a1c5bd35d8 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 23 Mar 2021 10:51:04 +0000 Subject: [PATCH 22/60] Add a context manager to handle hook --- pytorch_lightning/accelerators/accelerator.py | 15 +++- .../plugins/training_type/deepspeed.py | 82 +++++++++---------- .../training_type/training_type_plugin.py | 14 +++- pytorch_lightning/trainer/trainer.py | 7 ++ 4 files changed, 73 insertions(+), 45 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 60e6ea88b4250..a5b89215b1e25 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union +import contextlib +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union, Generator import torch from torch.optim import Optimizer @@ -432,3 +433,15 @@ def results(self) -> Any: In distributed training, we make sure to transfer the results to the appropriate master process. """ return self.training_type_plugin.results + + @contextlib.contextmanager + def model_parallel_context(self) -> Generator: + """ + Provide hook to create modules in a parallel aware context. This is useful for when we'd like to + shard the model instantly, which is useful for extremely large models which can save memory and + initialization time. 
+ + Returns: Model parallel context. + """ + with self.training_type_plugin.model_parallel_context(): + yield diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 00738a18f7e33..0a90b5ea0d05f 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import contextlib import json import logging import os from pathlib import Path from types import SimpleNamespace -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Generator import torch @@ -64,33 +64,33 @@ class DeepSpeedPlugin(DDPPlugin): DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH" def __init__( - self, - zero_optimization: bool = True, - stage: int = 2, - cpu_offload: bool = False, - cpu_offload_params: bool = False, - cpu_offload_use_pin_memory: bool = False, - contiguous_gradients: bool = True, - overlap_comm: bool = True, - allgather_partitions: bool = True, - reduce_scatter: bool = True, - allgather_bucket_size: int = 2e8, - reduce_bucket_size: int = 2e8, - zero_allow_untested_optimizer: bool = True, - config: Optional[Union[Path, str, dict]] = None, - logging_level: int = logging.WARN, - num_nodes: int = 1, - parallel_devices: Optional[List[torch.device]] = None, - cluster_environment: Optional[ClusterEnvironment] = None, - loss_scale: float = 0, - initial_scale_power: int = 32, - loss_scale_window: int = 1000, - hysteresis: int = 2, - min_loss_scale: int = 1, - partition_activations: bool = False, - cpu_checkpointing: bool = False, - contiguous_memory_optimization: bool = False, - synchronize_checkpoint_boundary: bool = False, + self, + zero_optimization: bool = True, + stage: int = 
2, + cpu_offload: bool = False, + cpu_offload_params: bool = False, + cpu_offload_use_pin_memory: bool = False, + contiguous_gradients: bool = True, + overlap_comm: bool = True, + allgather_partitions: bool = True, + reduce_scatter: bool = True, + allgather_bucket_size: int = 2e8, + reduce_bucket_size: int = 2e8, + zero_allow_untested_optimizer: bool = True, + config: Optional[Union[Path, str, dict]] = None, + logging_level: int = logging.WARN, + num_nodes: int = 1, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + loss_scale: float = 0, + initial_scale_power: int = 32, + loss_scale_window: int = 1000, + hysteresis: int = 2, + min_loss_scale: int = 1, + partition_activations: bool = False, + cpu_checkpointing: bool = False, + contiguous_memory_optimization: bool = False, + synchronize_checkpoint_boundary: bool = False, ) -> None: """ @@ -204,7 +204,6 @@ def __init__( self.loss_scale_window = loss_scale_window self.hysteresis = hysteresis self.min_loss_scale = min_loss_scale - self.on_model_parallel_setup_called = False def _load_config(self, config): if config is None and self.DEEPSPEED_ENV_VAR in os.environ: @@ -237,8 +236,6 @@ def init_deepspeed(self): if self.on_gpu: torch.cuda.set_device(self.root_device) - self._call_model_parallel_setup() - if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) else: @@ -283,14 +280,13 @@ def _initialize_deepspeed_train(self, model): self.lightning_module.trainer.schedulers = [lr_scheduler] self.model = model - def _call_model_parallel_setup(self): - if not self.on_model_parallel_setup_called: - if self.zero_stage_3: - with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): - self.lightning_module.trainer.call_hook("on_model_parallel_setup") - else: - self.lightning_module.trainer.call_hook("on_model_parallel_setup") - self.on_model_parallel_setup_called = True + @contextlib.contextmanager 
+ def model_parallel_context(self) -> Generator: + if self.zero_stage_3: + with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): + yield + else: + super().model_parallel_context() def _set_deepspeed_activation_checkpointing(self): if self.config.get('activation_checkpointing'): @@ -419,9 +415,9 @@ def _format_precision_config(self): raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.") def _create_default_config( - self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool, - cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool, - **zero_kwargs + self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool, + cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool, + **zero_kwargs ) -> Dict: cfg = { 'activation_checkpointing': { diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 08ccc8115d788..d6bcc16bddb1f 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union +from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union, Generator import torch from torch.nn import Module @@ -209,3 +210,14 @@ def restore_model_state_from_ckpt_path(self, def increment_accumulated_grad_global_step(self, trainer) -> None: trainer.global_step += 1 + + @contextlib.contextmanager + def model_parallel_context(self) -> Generator: + """ + Provide hook to create modules in a parallel aware context. This is useful for when we'd like to + shard the model instantly, which is useful for extremely large models which can save memory and + initialization time. + + Returns: Model parallel context. + """ + yield diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e1472410fe347..34eee2a298eb6 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -432,6 +432,7 @@ def fit( self.accelerator.setup_environment() self.call_setup_hook(model) # allow user to setup lightning_module in accelerator environment self.accelerator.setup(self, model) # note: this sets up self.lightning_module + self.call_model_parallel_hook(model) # allow user to setup in model parallel environment # ---------------------------- # INSPECT THE CORE LOOPS @@ -1075,6 +1076,12 @@ def call_setup_hook(self, model: LightningModule) -> None: self.setup(model, stage=state) model.setup(stage=state) + def call_model_parallel_hook(self, model: LightningModule) -> None: + if not hasattr(self.lightning_module, 'has_model_parallel_setup'): + with self.accelerator.model_parallel_context(): + model.on_model_parallel_setup() + self.lightning_module.has_model_parallel_setup = True + def call_teardown_hook(self, model: LightningModule) -> None: state = self._teardown_state self.profiler.teardown(stage=state) From e1f865e221f02d6a247b1aaf29c1ccfd8f24502b Mon 
Sep 17 00:00:00 2001 From: SeanNaren Date: Thu, 25 Mar 2021 16:25:25 +0000 Subject: [PATCH 23/60] Small naming cleanup --- .../plugins/training_type/deepspeed.py | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 0a90b5ea0d05f..4e45896f2f7d4 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -17,7 +17,7 @@ import os from pathlib import Path from types import SimpleNamespace -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Generator +from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union import torch @@ -64,33 +64,33 @@ class DeepSpeedPlugin(DDPPlugin): DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH" def __init__( - self, - zero_optimization: bool = True, - stage: int = 2, - cpu_offload: bool = False, - cpu_offload_params: bool = False, - cpu_offload_use_pin_memory: bool = False, - contiguous_gradients: bool = True, - overlap_comm: bool = True, - allgather_partitions: bool = True, - reduce_scatter: bool = True, - allgather_bucket_size: int = 2e8, - reduce_bucket_size: int = 2e8, - zero_allow_untested_optimizer: bool = True, - config: Optional[Union[Path, str, dict]] = None, - logging_level: int = logging.WARN, - num_nodes: int = 1, - parallel_devices: Optional[List[torch.device]] = None, - cluster_environment: Optional[ClusterEnvironment] = None, - loss_scale: float = 0, - initial_scale_power: int = 32, - loss_scale_window: int = 1000, - hysteresis: int = 2, - min_loss_scale: int = 1, - partition_activations: bool = False, - cpu_checkpointing: bool = False, - contiguous_memory_optimization: bool = False, - synchronize_checkpoint_boundary: bool = False, + self, + zero_optimization: bool = True, + stage: int = 2, + cpu_offload: bool = False, + cpu_offload_params: bool = False, + 
cpu_offload_use_pin_memory: bool = False, + contiguous_gradients: bool = True, + overlap_comm: bool = True, + allgather_partitions: bool = True, + reduce_scatter: bool = True, + allgather_bucket_size: int = 2e8, + reduce_bucket_size: int = 2e8, + zero_allow_untested_optimizer: bool = True, + config: Optional[Union[Path, str, dict]] = None, + logging_level: int = logging.WARN, + num_nodes: int = 1, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment: Optional[ClusterEnvironment] = None, + loss_scale: float = 0, + initial_scale_power: int = 32, + loss_scale_window: int = 1000, + hysteresis: int = 2, + min_loss_scale: int = 1, + partition_activations: bool = False, + cpu_checkpointing: bool = False, + contiguous_memory_optimization: bool = False, + synchronize_checkpoint_boundary: bool = False, ) -> None: """ @@ -415,9 +415,9 @@ def _format_precision_config(self): raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.") def _create_default_config( - self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool, - cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool, - **zero_kwargs + self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool, + cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool, + **zero_kwargs ) -> Dict: cfg = { 'activation_checkpointing': { @@ -465,15 +465,15 @@ def restore_model_state_from_ckpt_path(self, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: if torch.distributed.get_world_size() > 1: from pytorch_lightning.trainer.states import TrainerState - load_optimizer_states = self.lightning_module.trainer.state == TrainerState.FITTING + stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING save_dir = self._filepath_to_dir(ckpt_path) if self.zero_stage_3: # TODO: Currently 
required as this call is missing within the deepspeed engine. self.deepspeed_engine.optimizer._partition_all_parameters() - _, client_state = self.model.load_checkpoint( - save_dir, load_optimizer_states=load_optimizer_states, load_lr_scheduler_states=load_optimizer_states + _, client_state = self.deepspeed_engine.load_checkpoint( + save_dir, load_optimizer_states=stage_is_fit, load_lr_scheduler_states=stage_is_fit ) # restore datamodule states From 80fb792873088955bb411fbe29b4005375eef7d6 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 26 Mar 2021 09:34:36 +0000 Subject: [PATCH 24/60] wip --- .../plugins/training_type/deepspeed.py | 21 ++++++++++++---- tests/plugins/test_deepspeed_plugin.py | 24 ++++++++++++++++++- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 4e45896f2f7d4..12f2ae93e925c 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import contextlib +from collections import OrderedDict import json import logging import os @@ -35,7 +36,17 @@ if _DEEPSPEED_AVAILABLE: import deepspeed - from deepspeed.runtime.zero.stage3 import remove_module_hooks + # from deepspeed.runtime.zero.stage3 import remove_module_hooks + + +def remove_module_hooks(model: torch.nn.Module) -> None: + for module in model.modules(): + module._backward_hooks = OrderedDict() + module._is_full_backward_hook = None + module._forward_hooks = OrderedDict() + module._forward_pre_hooks = OrderedDict() + module._state_dict_hooks = OrderedDict() + module._load_state_dict_pre_hooks = OrderedDict() class LightningDeepSpeedModule(_LightningModuleWrapperBase): @@ -283,10 +294,12 @@ def _initialize_deepspeed_train(self, model): @contextlib.contextmanager def model_parallel_context(self) -> Generator: if self.zero_stage_3: - with deepspeed.zero.Init(remote_device="cpu", pin_memory=True): - yield + model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True) else: - super().model_parallel_context() + model_parallel_context = super().model_parallel_context() + + with model_parallel_context: + yield def _set_deepspeed_activation_checkpointing(self): if self.config.get('activation_checkpointing'): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 3fe121e2ef565..7c7e77df56285 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -420,6 +420,11 @@ def test_step(self, batch, batch_idx): self.log('test_loss', F.cross_entropy(logits, y), prog_bar=False, sync_dist=True) self.log('test_acc', self.test_acc(logits, y), prog_bar=True, sync_dist=True) + def predict_step(self, batch, batch_idx): + x, y = batch + logits = self.forward(x) + return self.test_acc(logits, y).compute() + def configure_optimizers(self): optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) @@ -450,7 +455,7 @@ def test_deepspeed_multigpu_stage_3(tmpdir, 
deepspeed_config): @RunIf(min_gpus=2, deepspeed=True) -def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): +def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir): """ Test to ensure with Stage 3 and multiple GPUs that we can save/load a model resuming from a checkpoint, and see convergence. @@ -477,6 +482,23 @@ def test_deepspeed_multigpu_stage_3_checkpointing(tmpdir, deepspeed_config): assert saved_results[0]['test_acc'] > 0.7 assert saved_results == results + trainer = Trainer( + max_epochs=10, + plugins=[DeepSpeedPlugin(stage=3)], + default_root_dir=tmpdir, + gpus=2, + precision=16, + accumulate_grad_batches=2, + callbacks=[ck], + resume_from_checkpoint=ck.best_model_path + ) + results = trainer.test(model, datamodule=dm) + assert results[0]['test_acc'] > 0.7 + + dm.predict_dataloader = dm.test_dataloader + results = trainer.predict(model, datamodule=dm) + assert results[0]['test_acc'] > 0.7 + @RunIf(min_gpus=2, deepspeed=True) @pytest.mark.parametrize('cpu_offload', [True, False]) From 1de2bcd8da73df727d417d0a3d71dc4adf0728b3 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 26 Mar 2021 10:23:35 +0000 Subject: [PATCH 25/60] move save_checkpoint responsability to accelerator --- pytorch_lightning/accelerators/accelerator.py | 14 ++- .../plugins/training_type/deepspeed.py | 10 +- .../plugins/training_type/tpu_spawn.py | 11 +- .../training_type/training_type_plugin.py | 28 ++++- .../connectors/checkpoint_connector.py | 107 +----------------- pytorch_lightning/utilities/cloud_io.py | 103 +++++++++++++++++ 6 files changed, 157 insertions(+), 116 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index d4cabd088b26b..4b5b6ede2f10b 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -17,7 +17,7 @@ import torch from torch.optim import Optimizer from torch.utils.data import DataLoader - +import 
pytorch_lightning as pl from pytorch_lightning.core import LightningModule from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.plugins.training_type import TrainingTypePlugin @@ -479,3 +479,15 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None: ' It will be removed in v1.5.' ) self.setup_precision_plugin(plugin) + + def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None: + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + filepath: write-target file's path + weights_only: saving model weights only + """ + self.training_type_plugin.save_checkpoint(trainer, filepath, weights_only) + + + diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 12f2ae93e925c..89282aac20e39 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -33,6 +33,8 @@ from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE +from pytorch_lightning.utilities.cloud_io import dump_checkpoint + if _DEEPSPEED_AVAILABLE: import deepspeed @@ -231,7 +233,6 @@ def _load_config(self, config): def pre_dispatch(self): self.init_deepspeed() - self.lightning_module.trainer.save_checkpoint = self.save_checkpoint self.barrier() def init_deepspeed(self): @@ -455,7 +456,7 @@ def _filepath_to_dir(self, filepath: str): def deepspeed_engine(self): return self.model - def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None: + def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None: """Save model/training states as a checkpoint file through 
state-dump and file-write. Args: @@ -465,13 +466,14 @@ def save_checkpoint(self, filepath: str, weights_only: bool = False) -> None: if torch.distributed.get_world_size() > 1: # Use deepspeed's internal checkpointing function to handle partitioned weights across processes # dump states as a checkpoint dictionary object - client_state = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) + client_state = dump_checkpoint(trainer, weights_only) save_dir = self._filepath_to_dir(filepath) _exclude_keys = ['state_dict', 'optimizer_states', 'lr_schedulers'] client_state = {k: v for k, v in client_state.items() if k not in _exclude_keys} self.deepspeed_engine.save_checkpoint(save_dir, client_state=client_state) + else: - self.lightning_module.trainer.checkpoint_connector.save_checkpoint(filepath) + super().save_checkpoint(trainer, filepath, weights_only) def restore_model_state_from_ckpt_path(self, ckpt_path: str, diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index a8706d54cb5c9..d8d2267547877 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -18,7 +18,7 @@ import torch import torch.multiprocessing as mp - +import pytorch_lightning as pl from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle @@ -27,6 +27,9 @@ from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything +from pytorch_lightning.utilities.cloud_io import dump_checkpoint + + if _TPU_AVAILABLE: import torch_xla.core.xla_model as xm @@ -106,8 +109,6 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None: 
trainer.accelerator.setup_optimizers(trainer) trainer.precision_plugin.connect(self._model, None, None) - # replace trainer save_checkpoint to use `xm.save` - trainer.save_checkpoint = self.save_checkpoint self.barrier("pre-run-stage") results = trainer.run_stage() @@ -298,7 +299,7 @@ def test_step(self, *args, **kwargs): def predict_step(self, *args, **kwargs): return self.lightning_module.predict_step(*args, **kwargs) - def save_checkpoint(self, filepath, weights_only: bool = False): + def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None: """Save model/training states as a checkpoint file through state-dump and file-write. Args: @@ -306,6 +307,6 @@ def save_checkpoint(self, filepath, weights_only: bool = False): weights_only: saving model weights only """ # dump states as a checkpoint dictionary object - _checkpoint = self.lightning_module.trainer.checkpoint_connector.dump_checkpoint(weights_only) + _checkpoint = dump_checkpoint(trainer, weights_only) # Todo: TypeError: 'mappingproxy' object does not support item assignment self.save({k: v for k, v in _checkpoint.items() if k != "callbacks"}, filepath) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 086be8446857f..e871e9135b320 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -14,6 +14,8 @@ import contextlib from abc import ABC, abstractmethod from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union, Generator +from pytorch_lightning.utilities.cloud_io import atomic_save +from pytorch_lightning.utilities import rank_zero_warn import torch from torch.nn import Module @@ -23,7 +25,7 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import unwrap_lightning_module from 
pytorch_lightning.plugins.base_plugin import Plugin -from pytorch_lightning.utilities.cloud_io import load as pl_load +from pytorch_lightning.utilities.cloud_io import load as pl_load, dump_checkpoint if TYPE_CHECKING: from pytorch_lightning.trainer.trainer import Trainer @@ -221,3 +223,27 @@ def model_parallel_context(self) -> Generator: Returns: Model parallel context. """ yield + + def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None: + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + filepath: write-target file's path + weights_only: saving model weights only + """ + # dump states as a checkpoint dictionary object + checkpoint = dump_checkpoint(trainer, weights_only) + if trainer.is_global_zero: + # write the checkpoint dictionary on the file + + checkpoint = self.on_save(checkpoint) + try: + atomic_save(checkpoint, filepath) + except AttributeError as err: + if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: + del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] + rank_zero_warn( + 'Warning, `hyper_parameters` dropped from checkpoint.' 
+ f' An attribute is not picklable {err}' + ) + atomic_save(checkpoint, filepath) \ No newline at end of file diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 09f9d1515f58f..d79863a86fc77 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -19,7 +19,7 @@ import torch -import pytorch_lightning +import pytorch_lightning as pl from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import ( _APEX_AVAILABLE, @@ -236,94 +236,6 @@ def hpc_save(self, folderpath: str, logger): return filepath - def dump_checkpoint(self, weights_only: bool = False) -> dict: - """Creating a model checkpoint dictionary object from various component states. - - Args: - weights_only: saving model weights only - - Return: - structured dictionary: { - 'epoch': training epoch - 'global_step': training global step - 'pytorch-lightning_version': PyTorch Lightning's version - 'callbacks': "callback specific state"[] # if not weights_only - 'optimizer_states': "PT optim's state_dict"[] # if not weights_only - 'lr_schedulers': "PT sched's state_dict"[] # if not weights_only - 'native_amp_scaling_state': PT amp's state_dict # if not weights_only and use native amp - 'amp_scaling_state': Apex's state_dict # if not weights_only and use apex amp - 'state_dict': Model's state_dict (e.g. 
network weights) - CHECKPOINT_HYPER_PARAMS_NAME: - CHECKPOINT_HYPER_PARAMS_KEY: - CHECKPOINT_HYPER_PARAMS_TYPE: - something_cool_i_want_to_save: anything you define through model.on_save_checkpoint - LightningDataModule.__class__.__name__: pl DataModule's state - } - """ - - # dump epoch/global_step/pytorch-lightning_version - current_epoch = self.trainer.current_epoch - global_step = self.trainer.global_step - has_reached_max_steps = self.trainer.max_steps and self.trainer.max_steps <= global_step - - global_step += 1 - if not has_reached_max_steps: - current_epoch += 1 - - model = self.trainer.lightning_module - - checkpoint = { - 'epoch': current_epoch, - 'global_step': global_step, - 'pytorch-lightning_version': pytorch_lightning.__version__, - 'state_dict': model.state_dict(), - } - - if not weights_only: - # dump callbacks - checkpoint['callbacks'] = self.trainer.on_save_checkpoint(checkpoint) - - optimizer_states = [] - for i, optimizer in enumerate(self.trainer.optimizers): - # Rely on accelerator to dump optimizer state - optimizer_state = self.trainer.accelerator.optimizer_state(optimizer) - optimizer_states.append(optimizer_state) - - checkpoint['optimizer_states'] = optimizer_states - - # dump lr schedulers - lr_schedulers = [] - for scheduler in self.trainer.lr_schedulers: - lr_schedulers.append(scheduler['scheduler'].state_dict()) - checkpoint['lr_schedulers'] = lr_schedulers - - # dump amp scaling - if ( - self.trainer.amp_backend == AMPType.NATIVE and self.trainer._device_type != DeviceType.TPU - and self.trainer.scaler is not None - ): - checkpoint['native_amp_scaling_state'] = self.trainer.scaler.state_dict() - elif self.trainer.amp_backend == AMPType.APEX: - checkpoint['amp_scaling_state'] = amp.state_dict() - - # dump hyper-parameters - if model.hparams: - if hasattr(model, '_hparams_name'): - checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name - # dump arguments - if _OMEGACONF_AVAILABLE and isinstance(model.hparams, 
Container): - checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams - checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_TYPE] = type(model.hparams) - else: - checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams) - - # give the model a chance to dump a few things - model.on_save_checkpoint(checkpoint) - if self.trainer.datamodule is not None: - self.trainer.datamodule.on_save_checkpoint(checkpoint) - - return checkpoint - def hpc_load(self, checkpoint_path: str, on_gpu: bool): """ Load model/training states from a 'PyTorch-Lightning checkpoint' file for hpc. @@ -394,19 +306,4 @@ def save_checkpoint(self, filepath, weights_only: bool = False): weights_only: saving model weights only """ # dump states as a checkpoint dictionary object - checkpoint = self.dump_checkpoint(weights_only) - if self.trainer.is_global_zero: - # write the checkpoint dictionary on the file - - if self.trainer.training_type_plugin: - checkpoint = self.trainer.training_type_plugin.on_save(checkpoint) - try: - atomic_save(checkpoint, filepath) - except AttributeError as err: - if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: - del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] - rank_zero_warn( - 'Warning, `hyper_parameters` dropped from checkpoint.' 
- f' An attribute is not picklable {err}' - ) - atomic_save(checkpoint, filepath) + self.trainer.accelerator.save_checkpoint(self.trainer, filepath, weights_only) \ No newline at end of file diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py index e94934020107d..febaefd24cf2e 100644 --- a/pytorch_lightning/utilities/cloud_io.py +++ b/pytorch_lightning/utilities/cloud_io.py @@ -19,6 +19,19 @@ import fsspec import torch +import pytorch_lightning as pl +from pytorch_lightning.utilities import ( + _APEX_AVAILABLE, + _OMEGACONF_AVAILABLE, + AMPType, + DeviceType, +) + +if _APEX_AVAILABLE: + from apex import amp + +if _OMEGACONF_AVAILABLE: + from omegaconf import Container def load(path_or_url: Union[str, IO, Path], map_location=None): @@ -63,3 +76,93 @@ def atomic_save(checkpoint, filepath: str): torch.save(checkpoint, bytesbuffer) with fsspec.open(filepath, "wb") as f: f.write(bytesbuffer.getvalue()) + + +def dump_checkpoint(trainer: 'pl.Trainer', weights_only: bool = False) -> dict: + """Creating a model checkpoint dictionary object from various component states. + + Args: + weights_only: saving model weights only + + Return: + structured dictionary: { + 'epoch': training epoch + 'global_step': training global step + 'pytorch-lightning_version': PyTorch Lightning's version + 'callbacks': "callback specific state"[] # if not weights_only + 'optimizer_states': "PT optim's state_dict"[] # if not weights_only + 'lr_schedulers': "PT sched's state_dict"[] # if not weights_only + 'native_amp_scaling_state': PT amp's state_dict # if not weights_only and use native amp + 'amp_scaling_state': Apex's state_dict # if not weights_only and use apex amp + 'state_dict': Model's state_dict (e.g. 
network weights) + CHECKPOINT_HYPER_PARAMS_NAME: + CHECKPOINT_HYPER_PARAMS_KEY: + CHECKPOINT_HYPER_PARAMS_TYPE: + something_cool_i_want_to_save: anything you define through model.on_save_checkpoint + LightningDataModule.__class__.__name__: pl DataModule's state + } + """ + from pytorch_lightning import LightningModule + + # dump epoch/global_step/pytorch-lightning_version + current_epoch = trainer.current_epoch + global_step = trainer.global_step + has_reached_max_steps = trainer.max_steps and trainer.max_steps <= global_step + + global_step += 1 + if not has_reached_max_steps: + current_epoch += 1 + + model = trainer.lightning_module + + checkpoint = { + 'epoch': current_epoch, + 'global_step': global_step, + 'pytorch-lightning_version': pl.__version__, + 'state_dict': model.state_dict(), + } + + if not weights_only: + # dump callbacks + checkpoint['callbacks'] = trainer.on_save_checkpoint(checkpoint) + + optimizer_states = [] + for i, optimizer in enumerate(trainer.optimizers): + # Rely on accelerator to dump optimizer state + optimizer_state = trainer.accelerator.optimizer_state(optimizer) + optimizer_states.append(optimizer_state) + + checkpoint['optimizer_states'] = optimizer_states + + # dump lr schedulers + lr_schedulers = [] + for scheduler in trainer.lr_schedulers: + lr_schedulers.append(scheduler['scheduler'].state_dict()) + checkpoint['lr_schedulers'] = lr_schedulers + + # dump amp scaling + if ( + trainer.amp_backend == AMPType.NATIVE and trainer._device_type != DeviceType.TPU + and trainer.scaler is not None + ): + checkpoint['native_amp_scaling_state'] = trainer.scaler.state_dict() + elif trainer.amp_backend == AMPType.APEX: + checkpoint['amp_scaling_state'] = amp.state_dict() + + # dump hyper-parameters + if model.hparams: + if hasattr(model, '_hparams_name'): + checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name + # dump arguments + if _OMEGACONF_AVAILABLE and isinstance(model.hparams, Container): + 
checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams + checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_TYPE] = type(model.hparams) + else: + checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams) + + # give the model a chance to dump a few things + model.on_save_checkpoint(checkpoint) + if trainer.datamodule is not None: + trainer.datamodule.on_save_checkpoint(checkpoint) + + return checkpoint From 90d6e03ce13a4845f70ce728576bafa6434cb2cf Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 26 Mar 2021 10:28:17 +0000 Subject: [PATCH 26/60] resolve flake8 --- pytorch_lightning/accelerators/accelerator.py | 8 +++----- .../plugins/training_type/deepspeed.py | 11 +++++------ .../plugins/training_type/tpu_spawn.py | 5 ++--- .../training_type/training_type_plugin.py | 13 +++++++------ .../connectors/checkpoint_connector.py | 19 ++++--------------- pytorch_lightning/trainer/trainer.py | 1 - pytorch_lightning/utilities/imports.py | 2 ++ 7 files changed, 23 insertions(+), 36 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 4b5b6ede2f10b..a7e6f20652576 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import contextlib -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union, Generator +from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union import torch from torch.optim import Optimizer from torch.utils.data import DataLoader + import pytorch_lightning as pl from pytorch_lightning.core import LightningModule from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin @@ -480,7 +481,7 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None: ) self.setup_precision_plugin(plugin) - def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None: + def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None: """Save model/training states as a checkpoint file through state-dump and file-write. Args: @@ -488,6 +489,3 @@ def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: boo weights_only: saving model weights only """ self.training_type_plugin.save_checkpoint(trainer, filepath, weights_only) - - - diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 89282aac20e39..c67c0c7e6096f 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import contextlib -from collections import OrderedDict import json import logging import os +from collections import OrderedDict from pathlib import Path from types import SimpleNamespace from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union import torch +import pytorch_lightning as pl from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase @@ -30,15 +31,13 @@ from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.apply_func import apply_to_collection +from pytorch_lightning.utilities.cloud_io import dump_checkpoint from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE -from pytorch_lightning.utilities.cloud_io import dump_checkpoint - if _DEEPSPEED_AVAILABLE: import deepspeed - # from deepspeed.runtime.zero.stage3 import remove_module_hooks def remove_module_hooks(model: torch.nn.Module) -> None: @@ -298,7 +297,7 @@ def model_parallel_context(self) -> Generator: model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True) else: model_parallel_context = super().model_parallel_context() - + with model_parallel_context: yield @@ -456,7 +455,7 @@ def _filepath_to_dir(self, filepath: str): def deepspeed_engine(self): return self.model - def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None: + def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None: """Save model/training states as a checkpoint file through state-dump and file-write. 
Args: diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index d8d2267547877..4f9f3427c5d7c 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -18,18 +18,17 @@ import torch import torch.multiprocessing as mp + import pytorch_lightning as pl from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _TPU_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities.cloud_io import dump_checkpoint from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything -from pytorch_lightning.utilities.cloud_io import dump_checkpoint - - if _TPU_AVAILABLE: import torch_xla.core.xla_model as xm diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index e871e9135b320..0bcc240f14fad 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -13,19 +13,20 @@ # limitations under the License. 
import contextlib from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Iterable, Optional, Tuple, TYPE_CHECKING, Union, Generator -from pytorch_lightning.utilities.cloud_io import atomic_save -from pytorch_lightning.utilities import rank_zero_warn +from typing import Any, Callable, Dict, Generator, Iterable, Optional, Tuple, TYPE_CHECKING, Union import torch from torch.nn import Module from torch.optim import Optimizer from torch.utils.data import DataLoader +import pytorch_lightning as pl from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.base_plugin import Plugin -from pytorch_lightning.utilities.cloud_io import load as pl_load, dump_checkpoint +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.cloud_io import atomic_save, dump_checkpoint +from pytorch_lightning.utilities.cloud_io import load as pl_load if TYPE_CHECKING: from pytorch_lightning.trainer.trainer import Trainer @@ -224,7 +225,7 @@ def model_parallel_context(self) -> Generator: """ yield - def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: bool = False) -> None: + def save_checkpoint(self, trainer: 'pl.Trainer', filepath: str, weights_only: bool = False) -> None: """Save model/training states as a checkpoint file through state-dump and file-write. Args: @@ -246,4 +247,4 @@ def save_checkpoint(self, trainer: 'pl.Trainer', filepath:str, weights_only: boo 'Warning, `hyper_parameters` dropped from checkpoint.' 
f' An attribute is not picklable {err}' ) - atomic_save(checkpoint, filepath) \ No newline at end of file + atomic_save(checkpoint, filepath) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index d79863a86fc77..286585c168782 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -19,17 +19,9 @@ import torch -import pytorch_lightning as pl from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import ( - _APEX_AVAILABLE, - _OMEGACONF_AVAILABLE, - AMPType, - DeviceType, - rank_zero_info, - rank_zero_warn, -) -from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem +from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType, DeviceType, rank_zero_info, rank_zero_warn +from pytorch_lightning.utilities.cloud_io import atomic_save, dump_checkpoint, get_filesystem from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS @@ -37,9 +29,6 @@ if _APEX_AVAILABLE: from apex import amp -if _OMEGACONF_AVAILABLE: - from omegaconf import Container - class CheckpointConnector: @@ -215,7 +204,7 @@ def hpc_save(self, folderpath: str, logger): # give model a chance to do something on hpc_save model = self.trainer.lightning_module - checkpoint = self.dump_checkpoint() + checkpoint = dump_checkpoint(self.trainer) model.on_hpc_save(checkpoint) @@ -306,4 +295,4 @@ def save_checkpoint(self, filepath, weights_only: bool = False): weights_only: saving model weights only """ # dump states as a checkpoint dictionary object - self.trainer.accelerator.save_checkpoint(self.trainer, filepath, weights_only) \ No newline at end of file + 
self.trainer.accelerator.save_checkpoint(self.trainer, filepath, weights_only) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 95b01aba1ecec..be584637d40d7 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -57,7 +57,6 @@ from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin from pytorch_lightning.tuner.tuning import Tuner -from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities import DeviceType, rank_zero_warn from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 87a503e5106a8..bf940e693d5e0 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -62,6 +62,7 @@ def _compare_version(package: str, op, version) -> bool: return True return op(pkg_version, LooseVersion(version)) + def _is_kineto_available() -> bool: _KINETO_AVAILABLE = False if _TORCH_GREATER_EQUAL_1_8: @@ -71,6 +72,7 @@ def _is_kineto_available() -> bool: _KINETO_AVAILABLE = kineto_available_fx() return _KINETO_AVAILABLE + _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 _TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0") From b6361b8e0fea65b89bf3903837d899e76bd9876d Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 26 Mar 2021 15:02:30 +0000 Subject: [PATCH 27/60] add BC --- pytorch_lightning/trainer/connectors/checkpoint_connector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index d79863a86fc77..baad964c3797d 
100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -29,7 +29,7 @@ rank_zero_info, rank_zero_warn, ) -from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem +from pytorch_lightning.utilities.cloud_io import atomic_save, dump_checkpoint, get_filesystem from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS @@ -298,6 +298,9 @@ def get_max_ckpt_path_from_folder(self, folder_path: Union[str, Path]) -> str: ckpt_number = max_suffix if max_suffix is not None else 0 return f'{folder_path}/hpc_ckpt_{ckpt_number}.ckpt' + def dump_checkpoint(self, weights_only: bool = False) -> dict: + return dump_checkpoint(self.trainer, weights_only) + def save_checkpoint(self, filepath, weights_only: bool = False): """Save model/training states as a checkpoint file through state-dump and file-write. 
From 6acaccb395a578f8f60caab45d059d47ad5d110e Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 29 Mar 2021 10:49:02 +0100 Subject: [PATCH 28/60] Change recommended scale to 16 --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index c67c0c7e6096f..7f85716f65ac8 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -95,7 +95,7 @@ def __init__( parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, loss_scale: float = 0, - initial_scale_power: int = 32, + initial_scale_power: int = 16, loss_scale_window: int = 1000, hysteresis: int = 2, min_loss_scale: int = 1, From 68b8a43a6f959244e21cc25f21f448bec1f5a2af Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 09:05:45 +0100 Subject: [PATCH 29/60] resolve flake8 --- pytorch_lightning/accelerators/accelerator.py | 1 - pytorch_lightning/plugins/training_type/deepspeed.py | 1 - pytorch_lightning/plugins/training_type/tpu_spawn.py | 1 - .../plugins/training_type/training_type_plugin.py | 1 - pytorch_lightning/trainer/trainer.py | 5 ++++- pytorch_lightning/utilities/cloud_io.py | 1 - 6 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index e947f7831efba..7d16d91e3bf82 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -18,7 +18,6 @@ from torch.optim import Optimizer from torch.utils.data import DataLoader -import pytorch_lightning as pl from pytorch_lightning.core import LightningModule from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from 
pytorch_lightning.plugins.training_type import TrainingTypePlugin diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 249a11ddb7ff7..59eff80b8c358 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -22,7 +22,6 @@ import torch -import pytorch_lightning as pl from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import _LightningModuleWrapperBase diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 659937bbdfa48..ba074e7cfb206 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -19,7 +19,6 @@ import torch import torch.multiprocessing as mp -import pytorch_lightning as pl from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin from pytorch_lightning.plugins.training_type.utils import on_colab_kaggle diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 9023e29852084..44de046b57108 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -20,7 +20,6 @@ from torch.optim import Optimizer from torch.utils.data import DataLoader -import pytorch_lightning as pl from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.base_plugin import Plugin diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d2582f76d0633..d565f0906e59e 100644 --- 
a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1086,7 +1086,10 @@ def call_setup_hook(self, model: LightningModule) -> None: def call_configure_sharded_model(self, model: LightningModule) -> None: # Call configure sharded model hook if accelerator requests. In some cases # we will not call the hook; the hook has initialized the sharded model for example. - if self.accelerator.call_configure_sharded_model_hook and not getattr(model, "call_configure_sharded_model_hook", False): + + # used on the model if the user re-create a trainer with resume_from_checkpoint + model_call_configure_sharded_model_hook = getattr(model, "call_configure_sharded_model_hook", False) + if self.accelerator.call_configure_sharded_model_hook and not model_call_configure_sharded_model_hook: with self.accelerator.model_sharded_context(): model.configure_sharded_model() self.configure_sharded_model(model) diff --git a/pytorch_lightning/utilities/cloud_io.py b/pytorch_lightning/utilities/cloud_io.py index 82e02cba42e27..e94934020107d 100644 --- a/pytorch_lightning/utilities/cloud_io.py +++ b/pytorch_lightning/utilities/cloud_io.py @@ -63,4 +63,3 @@ def atomic_save(checkpoint, filepath: str): torch.save(checkpoint, bytesbuffer) with fsspec.open(filepath, "wb") as f: f.write(bytesbuffer.getvalue()) - From a7dcb7b88137982552f82010949b6b9f482c0d56 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 08:23:32 +0000 Subject: [PATCH 30/60] update test --- tests/trainer/test_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index ee93ca59eca76..9c3ee6ceeef5b 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1475,7 +1475,7 @@ def test_trainer_predict_dp(tmpdir, num_gpus): predict(tmpdir, "dp", num_gpus, None) -@RunIf(min_gpus=2, special=True) +@RunIf(min_gpus=2, special=True, fairscale=True) def test_trainer_predict_ddp(tmpdir): 
predict(tmpdir, "ddp", 2, None, plugins=["ddp_sharded"]) From 6b08478ca9b7ead0f4cf899c9b9ed7f012b4ef47 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 08:26:39 +0000 Subject: [PATCH 31/60] update install --- requirements/extra.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/extra.txt b/requirements/extra.txt index 715916c4e36ac..cf18020b5714b 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -9,3 +9,4 @@ onnxruntime>=1.3.0 hydra-core>=1.0 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip +git+https://github.com/microsoft/DeepSpeed.git From 45a49c5b1f28ea7d24a440648c0080b208bd2047 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 08:50:23 +0000 Subject: [PATCH 32/60] update --- .github/workflows/ci_test-full.yml | 4 ++++ requirements/extra.txt | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index ec9e71c5b83b2..24e1561540fe4 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -117,6 +117,7 @@ jobs: # pip uninstall -y horovod python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" + - name: Install dependencies env: # MAKEFLAGS: "-j2" @@ -132,6 +133,9 @@ jobs: python ./requirements/adjust_versions.py requirements/extra.txt python ./requirements/adjust_versions.py requirements/examples.txt pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade + + # install latest version of DeepSpeed - temporary solution until next release + pip install git+https://github.com/microsoft/DeepSpeed.git pip list shell: bash diff --git a/requirements/extra.txt b/requirements/extra.txt index 
cf18020b5714b..d9fc96bce1eeb 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -8,5 +8,4 @@ torchtext>=0.5 onnxruntime>=1.3.0 hydra-core>=1.0 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs -https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip -git+https://github.com/microsoft/DeepSpeed.git +https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip \ No newline at end of file From a8da29954fda963fc290ac77970ad22c9ab83a7e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 09:13:52 +0000 Subject: [PATCH 33/60] update test --- azure-pipelines.yml | 2 ++ tests/plugins/test_deepspeed_plugin.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 85664bac74b67..2491f21fee285 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,6 +63,8 @@ jobs: python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir + # install latest version of DeepSpeed - temporary solution until next release + pip install git+https://github.com/microsoft/DeepSpeed.git pip list displayName: 'Install dependencies' diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 95192d44a008f..a515cbbfb813f 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,6 +1,7 @@ import json import os from typing import Any +from unittest.mock import call import pytest import torch @@ -9,6 +10,7 @@ from torch.optim import Optimizer from pytorch_lightning import LightningModule, seed_everything, Trainer +from pytorch_lightning import callbacks from 
pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.metrics import Accuracy from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin @@ -324,24 +326,26 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): assert checkpoint_config['synchronize_checkpoint_boundary'] -@RunIf(min_gpus=1, deepspeed=True) +#@RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config.""" deepspeed_zero_config['zero_optimization']['cpu_offload'] = False - class TestModel(BoringModel): + class TestCallback(Callback): - def on_train_start(self) -> None: - assert self.trainer.training_type_plugin.config['zero_optimization']['cpu_offload'] is False + def on_before_accelerator_backend_setup(self, trainer, pl_module) -> None: + assert trainer.training_type_plugin.config['zero_optimization']['cpu_offload'] is False raise SystemExit() - model = TestModel() + model = BoringModel() trainer = Trainer( + max_epochs=1, plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)], precision=16, gpus=1, default_root_dir=tmpdir, + callbacks=[TestCallback()] ) with pytest.raises(SystemExit): trainer.fit(model) From 99f1d960aa46f8945fd076c0e59bf45641e56e4d Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 10:15:33 +0100 Subject: [PATCH 34/60] update --- tests/plugins/test_deepspeed_plugin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index a515cbbfb813f..d3a7784600690 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,7 +1,6 @@ import json import os from typing import Any -from unittest.mock import call import pytest import torch @@ -10,7 +9,6 @@ from torch.optim import Optimizer from pytorch_lightning import 
LightningModule, seed_everything, Trainer -from pytorch_lightning import callbacks from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.metrics import Accuracy from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin @@ -326,7 +324,7 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): assert checkpoint_config['synchronize_checkpoint_boundary'] -#@RunIf(min_gpus=1, deepspeed=True) +@RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off cpu_offload, that this is set to False within the config.""" From 89601d85ed19ffe4f849af101c4f7699040a212e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 09:17:45 +0000 Subject: [PATCH 35/60] update --- .github/workflows/ci_test-full.yml | 3 --- azure-pipelines.yml | 1 - requirements/extra.txt | 3 ++- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 24e1561540fe4..0af812a0a172f 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -133,9 +133,6 @@ jobs: python ./requirements/adjust_versions.py requirements/extra.txt python ./requirements/adjust_versions.py requirements/examples.txt pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - - # install latest version of DeepSpeed - temporary solution until next release - pip install git+https://github.com/microsoft/DeepSpeed.git pip list shell: bash diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2491f21fee285..43953cc6ffaf6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -64,7 +64,6 @@ jobs: pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed pip install 
git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir # install latest version of DeepSpeed - temporary solution until next release - pip install git+https://github.com/microsoft/DeepSpeed.git pip list displayName: 'Install dependencies' diff --git a/requirements/extra.txt b/requirements/extra.txt index d9fc96bce1eeb..1175466a3df5e 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -8,4 +8,5 @@ torchtext>=0.5 onnxruntime>=1.3.0 hydra-core>=1.0 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs -https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip \ No newline at end of file +https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip +deepspeed \ No newline at end of file From 389c60b9705d57218e7355ca33fcf3f8422166f3 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 11:26:06 +0000 Subject: [PATCH 36/60] update test --- pytorch_lightning/accelerators/accelerator.py | 3 + .../plugins/training_type/deepspeed.py | 24 +++--- .../training_type/training_type_plugin.py | 18 ++--- pytorch_lightning/trainer/training_loop.py | 3 +- pytorch_lightning/utilities/imports.py | 12 +-- tests/plugins/test_deepspeed_plugin.py | 77 ++++++++++--------- tests/special_tests.sh | 18 ++++- 7 files changed, 80 insertions(+), 75 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 7d16d91e3bf82..37725e1c77f4b 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -511,3 +511,6 @@ def setup_optimizers_in_pre_dispatch(self) -> bool: Returns: If True, delay setup optimizers till pre_dispatch, else call within setup. 
""" return self.training_type_plugin.setup_optimizers_in_pre_dispatch + + def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int: + return self.training_type_plugin.compute_new_global_step(total_batch_idx, current_global_step) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 59eff80b8c358..a3fde3e448729 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -290,11 +290,11 @@ def _initialize_deepspeed_train(self, model): self.model = model @contextlib.contextmanager - def model_parallel_context(self) -> Generator: + def model_sharded_context(self) -> Generator: if self.zero_stage_3: model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True) else: - model_parallel_context = super().model_parallel_context() + model_parallel_context = super().model_sharded_context() with model_parallel_context: yield @@ -333,7 +333,6 @@ def _initialize_deepspeed_inference(self, model): # Remove all module hooks before initializing new model remove_module_hooks(model) model, _, _, _ = deepspeed.initialize( - args=SimpleNamespace(local_rank=self.local_rank), model=model, optimizer=optimizer, lr_scheduler=lightning_scheduler, @@ -460,7 +459,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: filepath: write-target file's path weights_only: saving model weights only """ - if torch.distributed.get_world_size() > 1: + if torch.distributed.get_world_size() > 1 and self.zero_stage_3: # Use deepspeed's internal checkpointing function to handle partitioned weights across processes # dump states as a checkpoint dictionary object save_dir = self._filepath_to_dir(filepath) @@ -476,7 +475,6 @@ def restore_model_state_from_ckpt_path(self, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: if torch.distributed.get_world_size() > 1: from 
pytorch_lightning.trainer.states import TrainerState - print("restore_model_state_from_ckpt_path") stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING save_dir = self._filepath_to_dir(ckpt_path) @@ -497,14 +495,14 @@ def restore_model_state_from_ckpt_path(self, return client_state, False return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location) - def _accumulated_batches_reached(self, trainer): - return trainer.total_batch_idx % trainer.accumulate_grad_batches == 0 + def _accumulated_batches_reached(self, total_batch_idx: int) -> bool: + return total_batch_idx % self._original_accumulate_grad_batches == 0 - def increment_accumulated_grad_global_step(self, trainer): + def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int: if self._original_accumulate_grad_batches is None: - trainer.global_step += 1 + return current_global_step + 1 else: - trainer.accumulate_grad_batches = self._original_accumulate_grad_batches - if self._accumulated_batches_reached(trainer): - trainer.global_step += 1 - trainer.accumulate_grad_batches = 1 + if self._accumulated_batches_reached(total_batch_idx, ): + current_global_step += 1 + return current_global_step + diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 44de046b57108..7c686ef140a05 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -223,19 +223,17 @@ def restore_model_state_from_ckpt_path(self, self.lightning_module.load_state_dict(ckpt['state_dict']) return ckpt, True - def increment_accumulated_grad_global_step(self, trainer) -> None: - trainer.global_step += 1 - - @contextlib.contextmanager - def model_parallel_context(self) -> Generator: + def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int: """ - Provide hook to 
create modules in a parallel aware context. This is useful for when we'd like to - shard the model instantly, which is useful for extremely large models which can save memory and - initialization time. + Provide a hook to count optimizer step calls. - Returns: Model parallel context. + Args: + total_batch_idx: Total number of batches seen for training + current_global_step: Current number of optimizer step calls + + Returns: New optimizer step calls """ - yield + return current_global_step + 1 def save_checkpoint(self, checkpoint: Dict[str, Any], filepath: str) -> None: """Save model/training states as a checkpoint file through state-dump and file-write. diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 9d02b64f2b386..5f69d1a4828e1 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -771,7 +771,8 @@ def increment_accumulated_grad_global_step(self): # progress global step according to grads progress if num_accumulated_batches_reached or num_training_batches_reached: - self.trainer.training_type_plugin.increment_accumulated_grad_global_step(self.trainer) + self.trainer.global_step = self.trainer.accelerator.compute_new_global_step( + self.trainer.total_batch_idx, self.trainer.global_step) def _accumulated_batches_reached(self): return (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0 diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index bf940e693d5e0..5c4de60263aef 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -63,23 +63,13 @@ def _compare_version(package: str, op, version) -> bool: return op(pkg_version, LooseVersion(version)) -def _is_kineto_available() -> bool: - _KINETO_AVAILABLE = False - if _TORCH_GREATER_EQUAL_1_8: - # kineto isn't available into pre 1.8 release - kineto_available_fx = getattr(torch.profiler, "kineto_available", 
None) - if kineto_available_fx: - _KINETO_AVAILABLE = kineto_available_fx() - return _KINETO_AVAILABLE - - _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 _TORCH_LOWER_EQUAL_1_4 = _compare_version("torch", operator.le, "1.5.0") _TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0") _TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0") _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0") -_KINETO_AVAILABLE = _is_kineto_available() +_KINETO_AVAILABLE = torch.profiler.kineto_available() if _TORCH_GREATER_EQUAL_1_8 else False _APEX_AVAILABLE = _module_available("apex.amp") _BOLTS_AVAILABLE = _module_available('pl_bolts') _DEEPSPEED_AVAILABLE = not _IS_WINDOWS and _module_available('deepspeed') diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index d3a7784600690..5833120e1a40c 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,6 +1,7 @@ import json import os from typing import Any +from unittest.mock import call import pytest import torch @@ -9,6 +10,7 @@ from torch.optim import Optimizer from pytorch_lightning import LightningModule, seed_everything, Trainer +from pytorch_lightning import callbacks from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.metrics import Accuracy from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin @@ -19,6 +21,16 @@ from tests.helpers.runif import RunIf +class ModelParallelBoringModel(BoringModel): + + def __init__(self): + super().__init__() + self.linear = None + + def configure_sharded_model(self) -> None: + self.linear = torch.nn.Linear(32, 2) + + def test_deepspeed_lightning_module(tmpdir): """ Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly. 
@@ -185,7 +197,7 @@ def test_deepspeed_defaults(tmpdir): assert isinstance(plugin.config["zero_optimization"], dict) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True) def test_invalid_deepspeed_defaults_no_precision(tmpdir): """Test to ensure that using defaults, if precision is not set to 16, we throw an exception.""" model = BoringModel() @@ -228,24 +240,25 @@ def test_deepspeed_run_configure_optimizers(tmpdir): whilst using configure_optimizers for optimizers and schedulers. """ - class TestModel(BoringModel): + class TestCB(Callback): - def on_train_start(self) -> None: + def on_train_start(self, trainer, pl_module) -> None: from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer - assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer) - assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD) - assert self.trainer.lr_schedulers == [] # DeepSpeed manages LR scheduler internally + assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer) + assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD) + assert trainer.lr_schedulers == [] # DeepSpeed manages LR scheduler internally # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler - assert isinstance(self.trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR) + assert isinstance(trainer.model.lr_scheduler, torch.optim.lr_scheduler.StepLR) - model = TestModel() + model = BoringModel() trainer = Trainer( plugins=DeepSpeedPlugin(), # disable ZeRO so our optimizers are not wrapped default_root_dir=tmpdir, gpus=1, fast_dev_run=True, precision=16, + callbacks=[TestCB()] ) trainer.fit(model) @@ -260,25 +273,26 @@ def test_deepspeed_config(tmpdir, deepspeed_zero_config): and saves the model weights to load correctly. 
""" - class TestModel(BoringModel): + class TestCB(Callback): - def on_train_start(self) -> None: + def on_train_start(self, trainer, pl_module) -> None: from deepspeed.runtime.lr_schedules import WarmupLR from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer - assert isinstance(self.trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer) - assert isinstance(self.trainer.optimizers[0].optimizer, torch.optim.SGD) - assert self.trainer.lr_schedulers == [] # DeepSpeed manages LR scheduler internally + assert isinstance(trainer.optimizers[0], FP16_DeepSpeedZeroOptimizer) + assert isinstance(trainer.optimizers[0].optimizer, torch.optim.SGD) + assert trainer.lr_schedulers == [] # DeepSpeed manages LR scheduler internally # Ensure DeepSpeed engine has initialized with our optimizer/lr_scheduler - assert isinstance(self.trainer.model.lr_scheduler, WarmupLR) + assert isinstance(trainer.model.lr_scheduler, WarmupLR) - model = TestModel() + model = BoringModel() trainer = Trainer( plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)], default_root_dir=tmpdir, gpus=1, fast_dev_run=True, precision=16, + callbacks=[TestCB()] ) trainer.fit(model) @@ -291,19 +305,19 @@ def on_train_start(self) -> None: def test_deepspeed_custom_precision_params(tmpdir): """Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.""" - class TestModel(BoringModel): + class TestCB(Callback): - def on_train_start(self) -> None: - assert self.trainer.training_type_plugin.config['fp16']['loss_scale'] == 10 - assert self.trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10 - assert self.trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10 - assert self.trainer.training_type_plugin.config['fp16']['hysteresis'] == 10 - assert self.trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10 + def on_train_start(self, trainer, pl_module) -> None: + assert 
trainer.training_type_plugin.config['fp16']['loss_scale'] == 10 + assert trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10 + assert trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10 + assert trainer.training_type_plugin.config['fp16']['hysteresis'] == 10 + assert trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10 raise SystemExit() - model = TestModel() + model = BoringModel() ds = DeepSpeedPlugin(loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10) - trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, gpus=1) + trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, amp_backend='native', gpus=1, callbacks=[TestCB()]) with pytest.raises(SystemExit): trainer.fit(model) @@ -356,7 +370,7 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config): """ model = BoringModel() trainer = Trainer( - plugins=[DeepSpeedPlugin()], + plugins=[DeepSpeedPlugin(zero_optimization=False, stage=2)], default_root_dir=tmpdir, gpus=2, fast_dev_run=True, @@ -368,16 +382,6 @@ def test_deepspeed_multigpu(tmpdir, deepspeed_config): _assert_save_model_is_equal(model, tmpdir, trainer) -class ModelParallelBoringModel(BoringModel): - - def __init__(self): - super().__init__() - self.linear = None - - def configure_sharded_model(self) -> None: - self.linear = torch.nn.Linear(32, 2) - - class ModelParallelClassificationModel(LightningModule): def __init__(self, lr: float = 0.01, num_blocks: int = 5): @@ -454,7 +458,8 @@ def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): trainer.fit(model) trainer.test(model) - _assert_save_model_is_equal(model, tmpdir, trainer) + # todo (tchaton) Currently load_from_checkpoint is not support for zero-v3 + # _assert_save_model_is_equal(model, tmpdir, trainer) @RunIf(min_gpus=2, deepspeed=True, special=True) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index aa5d65844a1c5..cc35cca97b114 100755 --- 
a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -32,6 +32,9 @@ linenos_arr=($linenos) blocklist='test_pytorch_profiler_nested_emit_nvtx' report='' +# replace debuggin token by anything to filter failing test. Reset to True at when committing. +DEBUGGING_TOKEN="" + for i in "${!files_arr[@]}"; do file=${files_arr[$i]} lineno=${linenos_arr[$i]} @@ -52,16 +55,23 @@ for i in "${!files_arr[@]}"; do break fi - # run the test - report+="Ran\t$file:$lineno::$test_name\n" - python ${defaults} "${file}::${test_name}" - break + if [[ $line == *$DEBUGGING_TOKEN* ]]; then + # run the test + report+="Ran\t$file:$lineno::$test_name\n" + python ${defaults} "${file}::${test_name}" + break + fi fi done < <(echo "$test_code") done nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx +if [[ -n "${DEBUGGING_TOKEN}" ]]; + echo "DEBUGGING_TOKEN: $DEBUGGING_TOKEN should be empty" + then exit 1 +fi + # echo test report printf '=%.s' {1..80} printf "\n$report" From de5f358a3de6337ff49d533f3b89d6578578a322 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 12:29:16 +0100 Subject: [PATCH 37/60] resolve flake8 --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 -- .../plugins/training_type/training_type_plugin.py | 2 +- tests/plugins/test_deepspeed_plugin.py | 4 +--- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index a3fde3e448729..3b1b35d822844 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -17,7 +17,6 @@ import os from collections import OrderedDict from pathlib import Path -from types import SimpleNamespace from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union import torch @@ -505,4 +504,3 @@ def compute_new_global_step(self, total_batch_idx: int, 
current_global_step: int if self._accumulated_batches_reached(total_batch_idx, ): current_global_step += 1 return current_global_step - diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 7c686ef140a05..22b8f43c28cd4 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -229,7 +229,7 @@ def compute_new_global_step(self, total_batch_idx: int, current_global_step: int Args: total_batch_idx: Total number of batches seen for training - current_global_step: Current number of optimizer step calls + current_global_step: Current number of optimizer step calls Returns: New optimizer step calls """ diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 5833120e1a40c..b2637a114a09c 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,7 +1,6 @@ import json import os from typing import Any -from unittest.mock import call import pytest import torch @@ -10,7 +9,6 @@ from torch.optim import Optimizer from pytorch_lightning import LightningModule, seed_everything, Trainer -from pytorch_lightning import callbacks from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.metrics import Accuracy from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin @@ -317,7 +315,7 @@ def on_train_start(self, trainer, pl_module) -> None: model = BoringModel() ds = DeepSpeedPlugin(loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10) - trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, amp_backend='native', gpus=1, callbacks=[TestCB()]) + trainer = Trainer(default_root_dir=tmpdir, plugins=[ds], precision=16, gpus=1, callbacks=[TestCB()]) with pytest.raises(SystemExit): trainer.fit(model) From 
301b1aaeb6d139f1028abef1d79c36db4cbbd338 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 12:35:04 +0100 Subject: [PATCH 38/60] update --- .github/workflows/ci_test-full.yml | 1 - azure-pipelines.yml | 1 - requirements/extra.txt | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 0af812a0a172f..ec9e71c5b83b2 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -117,7 +117,6 @@ jobs: # pip uninstall -y horovod python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" - - name: Install dependencies env: # MAKEFLAGS: "-j2" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 43953cc6ffaf6..85664bac74b67 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,7 +63,6 @@ jobs: python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir - # install latest version of DeepSpeed - temporary solution until next release pip list displayName: 'Install dependencies' diff --git a/requirements/extra.txt b/requirements/extra.txt index 1175466a3df5e..cd6c5fc94fe6f 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -9,4 +9,4 @@ onnxruntime>=1.3.0 hydra-core>=1.0 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip -deepspeed \ No newline at end of file +deepspeed==0.3.13 From b9542ae0aa85dd1d4e6e728b2689b88b55a5a3c2 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 13:33:16 +0100 Subject: 
[PATCH 39/60] update --- tests/special_tests.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index cc35cca97b114..b82c90c8decfb 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -67,9 +67,10 @@ done nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx -if [[ -n "${DEBUGGING_TOKEN}" ]]; +if [[ -n ${DEBUGGING_TOKEN} ]]; +then echo "DEBUGGING_TOKEN: $DEBUGGING_TOKEN should be empty" - then exit 1 + exit 1 fi # echo test report From 48c09505ab981cd2b0a484ed03aafd766e7ccccc Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 13:37:41 +0100 Subject: [PATCH 40/60] update on comments --- dockers/base-cuda/Dockerfile | 2 +- pytorch_lightning/accelerators/accelerator.py | 6 +++--- pytorch_lightning/plugins/training_type/deepspeed.py | 11 ++++------- .../plugins/training_type/training_type_plugin.py | 2 +- pytorch_lightning/trainer/training_loop.py | 5 +++-- requirements/extra.txt | 2 +- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index d7c13e7560010..476ef75319b4b 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -114,7 +114,7 @@ RUN \ rm -rf apex RUN \ - pip install deepspeed==0.3.13 + pip install deepspeed>=0.3.13 RUN \ # Show what we have diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 37725e1c77f4b..569af875e6c64 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -441,7 +441,7 @@ def results(self) -> Any: return self.training_type_plugin.results @contextlib.contextmanager - def model_sharded_context(self) -> Generator: + def model_sharded_context(self) -> Generator[None, None, None]: """ Provide hook to create modules in a distributed aware context. 
This is useful for when we'd like to shard the model instantly - useful for extremely large models. Can save memory and @@ -512,5 +512,5 @@ def setup_optimizers_in_pre_dispatch(self) -> bool: """ return self.training_type_plugin.setup_optimizers_in_pre_dispatch - def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int: - return self.training_type_plugin.compute_new_global_step(total_batch_idx, current_global_step) + def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: + return self.training_type_plugin.update_global_step(total_batch_idx, current_global_step) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 3b1b35d822844..80a12fe181745 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -289,7 +289,7 @@ def _initialize_deepspeed_train(self, model): self.model = model @contextlib.contextmanager - def model_sharded_context(self) -> Generator: + def model_sharded_context(self) -> Generator[None, None, None]: if self.zero_stage_3: model_parallel_context = deepspeed.zero.Init(remote_device="cpu", pin_memory=True) else: @@ -494,13 +494,10 @@ def restore_model_state_from_ckpt_path(self, return client_state, False return super().restore_model_state_from_ckpt_path(ckpt_path, map_location=map_location) - def _accumulated_batches_reached(self, total_batch_idx: int) -> bool: - return total_batch_idx % self._original_accumulate_grad_batches == 0 - - def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int: + def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: if self._original_accumulate_grad_batches is None: - return current_global_step + 1 + return super().update_global_step(total_batch_idx, current_global_step) else: - if self._accumulated_batches_reached(total_batch_idx, ): + if total_batch_idx % 
self._original_accumulate_grad_batches == 0: current_global_step += 1 return current_global_step diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 22b8f43c28cd4..d155ce018c77d 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -223,7 +223,7 @@ def restore_model_state_from_ckpt_path(self, self.lightning_module.load_state_dict(ckpt['state_dict']) return ckpt, True - def compute_new_global_step(self, total_batch_idx: int, current_global_step: int) -> int: + def update_global_step(self, total_batch_idx: int, current_global_step: int) -> int: """ Provide a hook to count optimizer step calls. diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 5f69d1a4828e1..4640343710f81 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -771,8 +771,9 @@ def increment_accumulated_grad_global_step(self): # progress global step according to grads progress if num_accumulated_batches_reached or num_training_batches_reached: - self.trainer.global_step = self.trainer.accelerator.compute_new_global_step( - self.trainer.total_batch_idx, self.trainer.global_step) + self.trainer.global_step = self.trainer.accelerator.update_global_step( + self.trainer.total_batch_idx, self.trainer.global_step + ) def _accumulated_batches_reached(self): return (self.trainer.batch_idx + 1) % self.trainer.accumulate_grad_batches == 0 diff --git a/requirements/extra.txt b/requirements/extra.txt index cd6c5fc94fe6f..cee1fd0eb07e1 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -9,4 +9,4 @@ onnxruntime>=1.3.0 hydra-core>=1.0 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip 
-deepspeed==0.3.13 +deepspeed>=0.3.13 From c2304071900669052b347a8be86a49da548866b7 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 30 Mar 2021 13:42:14 +0100 Subject: [PATCH 41/60] Push --- .github/workflows/events-nightly.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 5ad4396a006f7..91d509f193339 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -3,6 +3,7 @@ name: Nightly events # https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 on: + push: {} # fixme schedule: - cron: "0 0 * * *" # At the end of every day From 783265f338ee7d757ac17e451090244537ef1c9e Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 30 Mar 2021 13:42:34 +0100 Subject: [PATCH 42/60] pull --- .github/workflows/events-nightly.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 91d509f193339..5ad4396a006f7 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -3,7 +3,6 @@ name: Nightly events # https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 on: - push: {} # fixme schedule: - cron: "0 0 * * *" # At the end of every day From c8f79f97f43543f26aa9c04eb56d6c8f6db02d29 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 13:56:43 +0100 Subject: [PATCH 43/60] Update pytorch_lightning/plugins/training_type/deepspeed.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 80a12fe181745..06622a41193f7 100644 
--- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -166,7 +166,7 @@ def __init__( partition_activations: Enables partition activation when used with ZeRO stage 3. Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint. - See https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional + See `deepspeed tutorial <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_ cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled From 61378debabeee499b4f73f8b370b71252e604846 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 13:56:53 +0100 Subject: [PATCH 44/60] Update pytorch_lightning/plugins/training_type/deepspeed.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 06622a41193f7..566461acdd4fd 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -173,7 +173,7 @@ def __init__( contiguous_memory_optimization: Copies partitioned activations so that they are contiguous in memory. Not supported by all models - synchronize_checkpoint_boundary: Insert ``torch.cuda.synchronize()`` at each checkpoint boundary. + synchronize_checkpoint_boundary: Insert :func:`torch.cuda.synchronize` at each checkpoint boundary.
""" if not _DEEPSPEED_AVAILABLE: raise MisconfigurationException( From 45c9569d58cb2c1518d161e76163aee086108885 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 14:01:54 +0100 Subject: [PATCH 45/60] update --- pytorch_lightning/plugins/training_type/deepspeed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 566461acdd4fd..78ab9af3dc139 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -166,7 +166,8 @@ def __init__( partition_activations: Enables partition activation when used with ZeRO stage 3. Still requires you to wrap your forward functions in deepspeed.checkpointing.checkpoint. - See `deepspeed tutorial <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_ + See `deepspeed tutorial + <https://www.deepspeed.ai/tutorials/megatron/#deepspeed-activation-checkpoints-optional>`_ cpu_checkpointing: Offloads partitioned activations to CPU if ``partition_activations`` is enabled From deb2ea265e53ba47e800bddf35d6db42e2a55525 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 30 Mar 2021 14:05:26 +0100 Subject: [PATCH 46/60] Apply suggestions from code review --- pytorch_lightning/plugins/training_type/deepspeed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 78ab9af3dc139..4dc3a7a4dc26c 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -38,6 +38,7 @@ def remove_module_hooks(model: torch.nn.Module) -> None: + # TODO: awaiting this feature to move upstream to DeepSpeed for module in model.modules(): module._backward_hooks = OrderedDict() module._is_full_backward_hook = None @@ -320,7 +321,7 @@ def _initialize_deepspeed_inference(self, model): ) optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer() inference_config = { - # todo: this is
required for DeepSpeed throughput timers + # todo: this is required for DeepSpeed throughput timers, so throughput timers will be incorrect 'train_micro_batch_size_per_gpu': 1, } if 'fp16' in self.config: From 122e91109f928f1868ef700b668b9b6e8bf093f7 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 30 Mar 2021 14:07:27 +0100 Subject: [PATCH 47/60] Swap to using world size defined by plugin --- pytorch_lightning/plugins/training_type/deepspeed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 4dc3a7a4dc26c..9d3b9650c5120 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -460,7 +460,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: filepath: write-target file's path weights_only: saving model weights only """ - if torch.distributed.get_world_size() > 1 and self.zero_stage_3: + if self.world_size > 1 and self.zero_stage_3: # Use deepspeed's internal checkpointing function to handle partitioned weights across processes # dump states as a checkpoint dictionary object save_dir = self._filepath_to_dir(filepath) @@ -474,7 +474,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: def restore_model_state_from_ckpt_path(self, ckpt_path: str, map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: - if torch.distributed.get_world_size() > 1: + if self.world_size > 1: from pytorch_lightning.trainer.states import TrainerState stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING save_dir = self._filepath_to_dir(ckpt_path) From dfb403b3d481472021b2f4c7214fe7ff08402a52 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 14:10:03 +0100 Subject: [PATCH 48/60] update --- pytorch_lightning/plugins/training_type/deepspeed.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) 
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 78ab9af3dc139..7df3213067491 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -37,6 +37,7 @@ import deepspeed +# todo (tchaton) use deepspeed version when merged def remove_module_hooks(model: torch.nn.Module) -> None: for module in model.modules(): module._backward_hooks = OrderedDict() @@ -320,7 +321,7 @@ def _initialize_deepspeed_inference(self, model): ) optimizer, lightning_scheduler, optimizer_frequencies = self._init_scheduler_optimizer() inference_config = { - # todo: this is required for DeepSpeed throughput timers + # todo: this is required for DeepSpeed throughput timers, or throughput timers will be incorrect 'train_micro_batch_size_per_gpu': 1, } if 'fp16' in self.config: @@ -369,8 +370,15 @@ def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Calla self.model.step(**kwargs) def _handle_gradient_accumulation_steps(self): + """ + This function overrides the trainer.accumulation_scheduler to generate + ``accumulate_grad_batches=1``. + Therefore, ``optimizer_step`` will be called on every batch seen + so DeepSpeed Engine handles the gradient accumulation logic internally. + """ if self.config.get("gradient_accumulation_steps") > 1: self._original_accumulate_grad_batches = self.lightning_module.trainer.accumulate_grad_batches + # todo (tchaton) Add support for accumulate_grad_batches being a dictionary.
self.lightning_module.trainer.accumulation_scheduler = GradientAccumulationScheduler({0: 1}) else: self._original_accumulate_grad_batches = None From 066e0f05e74336ea7b0b641bafca198d576e6183 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 14:11:49 +0100 Subject: [PATCH 49/60] update todo --- pytorch_lightning/plugins/training_type/deepspeed.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index abba793842c85..f73a94474bdbd 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -37,9 +37,8 @@ import deepspeed -# todo (tchaton) use deepspeed version when merged def remove_module_hooks(model: torch.nn.Module) -> None: - # TODO: awaiting this feature to move upstream to DeepSpeed + # todo (tchaton) awaiting this feature to move upstream to DeepSpeed for module in model.modules(): module._backward_hooks = OrderedDict() module._is_full_backward_hook = None From d41284e6178d1a4f5793cdea5d2fc19b31aa98c9 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 30 Mar 2021 14:17:14 +0100 Subject: [PATCH 50/60] Remove deepspeed from extra, keep it in the base cuda docker install --- requirements/extra.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/extra.txt b/requirements/extra.txt index cee1fd0eb07e1..715916c4e36ac 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -9,4 +9,3 @@ onnxruntime>=1.3.0 hydra-core>=1.0 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip -deepspeed>=0.3.13 From 0c9836cf75d8fec34dca4d18ba520e7604b93bbe Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 30 Mar 2021 14:17:43 +0100 Subject: [PATCH 51/60] Push --- .github/workflows/events-nightly.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 5ad4396a006f7..91d509f193339 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -3,6 +3,7 @@ name: Nightly events # https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 on: + push: {} # fixme schedule: - cron: "0 0 * * *" # At the end of every day From d1c511ee049c6f26e40c90e631968ff036d13179 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 30 Mar 2021 14:18:01 +0100 Subject: [PATCH 52/60] pull --- .github/workflows/events-nightly.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 91d509f193339..5ad4396a006f7 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -3,7 +3,6 @@ name: Nightly events # https://jasonet.co/posts/scheduled-actions/ # https://github.community/t/distinct-job-for-each-schedule/17811/2 on: - push: {} # fixme schedule: - cron: "0 0 * * *" # At the end of every day From 67d31fa1122f8fa5d781215147556a3b92d0f8b1 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 14:33:17 +0100 Subject: [PATCH 53/60] update --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index f73a94474bdbd..ab1bd3d8b2405 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -17,6 +17,7 @@ import os from collections import OrderedDict from pathlib import Path +from types import SimpleNamespace from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union import torch @@ -277,6 +278,7 @@ def _initialize_deepspeed_train(self, model): optimizer, lightning_scheduler, optimizer_frequencies = 
self._init_scheduler_optimizer() model_parameters = filter(lambda p: p.requires_grad, self.model.parameters()) model, optimizer, _, lr_scheduler = deepspeed.initialize( + args=SimpleNamespace(local_rank=self.local_rank), model=model, model_parameters=model_parameters, optimizer=optimizer, From 1740eed89b86aad2fb80124085299dc77c4c440f Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 14:46:52 +0100 Subject: [PATCH 54/60] update --- pytorch_lightning/plugins/training_type/deepspeed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index ab1bd3d8b2405..0f5bb16bab7fd 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -336,6 +336,7 @@ def _initialize_deepspeed_inference(self, model): # Remove all module hooks before initializing new model remove_module_hooks(model) model, _, _, _ = deepspeed.initialize( + args=SimpleNamespace(local_rank=self.local_rank), model=model, optimizer=optimizer, lr_scheduler=lightning_scheduler, From 300f3aa4cb61588beeed714363643efe0d41ad5d Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 30 Mar 2021 15:02:01 +0100 Subject: [PATCH 55/60] update --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 85664bac74b67..9d3cfdd2ac1ce 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,6 +63,7 @@ jobs: python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir + pip install --no-cache-dir git+https://github.com/microsoft/DeepSpeed.git pip list displayName: 'Install dependencies' From 
40b1cc6c2505456d39d5e96bcb7f0108a962c753 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Tue, 30 Mar 2021 15:29:43 +0000 Subject: [PATCH 56/60] update --- azure-pipelines.yml | 1 - requirements/extra.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9d3cfdd2ac1ce..85664bac74b67 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,7 +63,6 @@ jobs: python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed pip install git+https://$(AUTH_TOKEN)@github.com/PyTorchLightning/lightning-dtrun.git@v0.0.2 --no-cache-dir - pip install --no-cache-dir git+https://github.com/microsoft/DeepSpeed.git pip list displayName: 'Install dependencies' diff --git a/requirements/extra.txt b/requirements/extra.txt index 715916c4e36ac..cee1fd0eb07e1 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -9,3 +9,4 @@ onnxruntime>=1.3.0 hydra-core>=1.0 # todo: when switch to standard package stream, drop `fairscale` from hard mocked docs libs https://github.com/PyTorchLightning/fairscale/archive/pl_1.2.0.zip +deepspeed>=0.3.13 From 603caf13bed6c5ec2e8a1e748ab21c9660f21242 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 30 Mar 2021 17:54:26 +0200 Subject: [PATCH 57/60] Minor changes --- .../plugins/training_type/deepspeed.py | 2 +- pytorch_lightning/utilities/imports.py | 3 ++- tests/special_tests.sh | 23 ++++++++----------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 0f5bb16bab7fd..10cce8d0e0182 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -456,7 +456,7 @@ def _create_default_config( } return 
cfg - def _filepath_to_dir(self, filepath: str): + def _filepath_to_dir(self, filepath: str) -> str: return os.path.dirname(filepath) @property diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 5c4de60263aef..001b9a67c5703 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -69,7 +69,7 @@ def _compare_version(package: str, op, version) -> bool: _TORCH_GREATER_EQUAL_1_6 = _compare_version("torch", operator.ge, "1.6.0") _TORCH_GREATER_EQUAL_1_7 = _compare_version("torch", operator.ge, "1.7.0") _TORCH_GREATER_EQUAL_1_8 = _compare_version("torch", operator.ge, "1.8.0") -_KINETO_AVAILABLE = torch.profiler.kineto_available() if _TORCH_GREATER_EQUAL_1_8 else False + _APEX_AVAILABLE = _module_available("apex.amp") _BOLTS_AVAILABLE = _module_available('pl_bolts') _DEEPSPEED_AVAILABLE = not _IS_WINDOWS and _module_available('deepspeed') @@ -79,6 +79,7 @@ def _compare_version(package: str, op, version) -> bool: _HOROVOD_AVAILABLE = _module_available("horovod.torch") _HYDRA_AVAILABLE = _module_available("hydra") _HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental") +_KINETO_AVAILABLE = torch.profiler.kineto_available() if _TORCH_GREATER_EQUAL_1_8 else False _NATIVE_AMP_AVAILABLE = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast") _OMEGACONF_AVAILABLE = _module_available("omegaconf") _RPC_AVAILABLE = not _IS_WINDOWS and _module_available('torch.distributed.rpc') diff --git a/tests/special_tests.sh b/tests/special_tests.sh index b82c90c8decfb..cf81700291b8d 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -32,9 +32,6 @@ linenos_arr=($linenos) blocklist='test_pytorch_profiler_nested_emit_nvtx' report='' -# replace debuggin token by anything to filter failing test. Reset to True at when committing. 
-DEBUGGING_TOKEN="" - for i in "${!files_arr[@]}"; do file=${files_arr[$i]} lineno=${linenos_arr[$i]} @@ -55,24 +52,24 @@ for i in "${!files_arr[@]}"; do break fi - if [[ $line == *$DEBUGGING_TOKEN* ]]; then - # run the test - report+="Ran\t$file:$lineno::$test_name\n" - python ${defaults} "${file}::${test_name}" + # SPECIAL_PATTERN allows filtering the tests to run when debugging. + # use as `SPECIAL_PATTERN="foo_bar" ./special_tests.sh` to run only those + # test with `foo_bar` in their name + if [[ $line != *$SPECIAL_PATTERN* ]]; then + report+="Skipped\t$file:$lineno::$test_name\n" break fi + + # run the test + report+="Ran\t$file:$lineno::$test_name\n" + python ${defaults} "${file}::${test_name}" + break fi done < <(echo "$test_code") done nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx -if [[ -n ${DEBUGGING_TOKEN} ]]; -then - echo "DEBUGGING_TOKEN: $DEBUGGING_TOKEN should be empty" - exit 1 -fi - # echo test report printf '=%.s' {1..80} printf "\n$report" From 62f67e813d8e76685c819f84e9f97319c3772ef6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Mar 2021 18:01:50 +0200 Subject: [PATCH 58/60] duplicate --- dockers/base-cuda/Dockerfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 476ef75319b4b..a3624f536a0f2 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -113,9 +113,6 @@ RUN \ pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ rm -rf apex -RUN \ - pip install deepspeed>=0.3.13 - RUN \ # Show what we have pip --version && \ From 5786c4b3eaeb3f9a007b5276eb3eb6fb612406dc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Mar 2021 18:05:52 +0200 Subject: [PATCH 59/60] format --- pytorch_lightning/plugins/precision/double.py | 6 +-- .../plugins/training_type/deepspeed.py | 19 ++++++--- 
pytorch_lightning/utilities/__init__.py | 1 + tests/models/test_hooks.py | 42 +++++-------------- tests/plugins/test_double_plugin.py | 6 +-- 5 files changed, 27 insertions(+), 47 deletions(-) diff --git a/pytorch_lightning/plugins/precision/double.py b/pytorch_lightning/plugins/precision/double.py index 4720f0f874fd0..6e37c79f2b163 100644 --- a/pytorch_lightning/plugins/precision/double.py +++ b/pytorch_lightning/plugins/precision/double.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from functools import wraps -from typing import Any, Sequence, Tuple, TYPE_CHECKING, List +from typing import Any, List, Sequence, Tuple, TYPE_CHECKING import torch @@ -44,9 +44,7 @@ def _to_double_precision(data: torch.Tensor) -> torch.Tensor: @staticmethod def _move_float_tensors_to_double(collection: Any) -> Any: - return apply_to_collection( - collection, torch.Tensor, function=_DoublePrecisionPatch._to_double_precision - ) + return apply_to_collection(collection, torch.Tensor, function=_DoublePrecisionPatch._to_double_precision) @classmethod def patch(cls, model: 'Module', method_name: str) -> '_DoublePrecisionPatch': diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 10cce8d0e0182..7011b81a8d131 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -436,9 +436,14 @@ def _format_precision_config(self): raise MisconfigurationException("To use DeepSpeed ZeRO Optimization, you must set precision=16.") def _create_default_config( - self, zero_optimization: bool, zero_allow_untested_optimizer: bool, partition_activations: bool, - cpu_checkpointing: bool, contiguous_memory_optimization: bool, synchronize_checkpoint_boundary: bool, - **zero_kwargs + self, + zero_optimization: bool, + zero_allow_untested_optimizer: bool, + partition_activations: bool, + 
cpu_checkpointing: bool, + contiguous_memory_optimization: bool, + synchronize_checkpoint_boundary: bool, + **zero_kwargs, ) -> Dict: cfg = { 'activation_checkpointing': { @@ -481,9 +486,11 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: else: super().save_checkpoint(checkpoint, filepath) - def restore_model_state_from_ckpt_path(self, - ckpt_path: str, - map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: + def restore_model_state_from_ckpt_path( + self, + ckpt_path: str, + map_location=lambda storage, loc: storage, + ) -> Tuple[Dict, bool]: if self.world_size > 1: from pytorch_lightning.trainer.states import TrainerState stage_is_fit = self.lightning_module.trainer.state == TrainerState.FITTING diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 03981b0042eac..28cb05bc06f2d 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -14,6 +14,7 @@ """General utilities""" import numpy + from pytorch_lightning.utilities.apply_func import move_data_to_device # noqa: F401 from pytorch_lightning.utilities.distributed import ( # noqa: F401 AllGatherGrad, diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 1d55d4a5a63b7..57af82ccc3e08 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -19,7 +19,7 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.trainer.states import TrainerState -from tests.helpers import BoringModel, RandomDataset, BoringDataModule +from tests.helpers import BoringDataModule, BoringModel, RandomDataset from tests.helpers.runif import RunIf @@ -515,6 +515,7 @@ def test_trainer_datamodule_hook_system(tmpdir): """Test the LightningDataModule hook system.""" class HookedDataModule(BoringDataModule): + def __init__(self): super().__init__() self.called = [] @@ -574,23 +575,10 @@ def on_after_batch_transfer(self, *args, **kwargs): trainer.fit(model, 
datamodule=dm) expected = [ - 'prepare_data', - 'setup_fit', - 'val_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'train_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'val_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', + 'prepare_data', 'setup_fit', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', + 'on_after_batch_transfer', 'train_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', + 'on_after_batch_transfer', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', + 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', 'on_after_batch_transfer', 'teardown_fit' ] assert dm.called == expected @@ -599,13 +587,8 @@ def on_after_batch_transfer(self, *args, **kwargs): trainer.validate(model, datamodule=dm, verbose=False) expected = [ - 'prepare_data', - 'setup_validate', - 'val_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'teardown_validate' + 'prepare_data', 'setup_validate', 'val_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', + 'on_after_batch_transfer', 'teardown_validate' ] assert dm.called == expected @@ -613,12 +596,7 @@ def on_after_batch_transfer(self, *args, **kwargs): trainer.test(model, datamodule=dm, verbose=False) expected = [ - 'prepare_data', - 'setup_test', - 'test_dataloader', - 'on_before_batch_transfer', - 'transfer_batch_to_device', - 'on_after_batch_transfer', - 'teardown_test' + 'prepare_data', 'setup_test', 'test_dataloader', 'on_before_batch_transfer', 'transfer_batch_to_device', + 'on_after_batch_transfer', 'teardown_test' ] assert dm.called == expected diff --git a/tests/plugins/test_double_plugin.py 
b/tests/plugins/test_double_plugin.py index f089b1c23149e..175ca5ecaba6b 100644 --- a/tests/plugins/test_double_plugin.py +++ b/tests/plugins/test_double_plugin.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import pytest - import torch from torch.utils.data import DataLoader, Dataset @@ -107,10 +106,7 @@ def predict_dataloader(self): return DataLoader(RandomDataset(32, 64)) -@pytest.mark.parametrize( - 'boring_model', - (DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward) -) +@pytest.mark.parametrize('boring_model', (DoublePrecisionBoringModel, DoublePrecisionBoringModelNoForward)) def test_double_precision(tmpdir, boring_model): model = boring_model() original_training_step = model.training_step From 83e1343dc9ebdceb33c626698ac541cddf1cf5e7 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 30 Mar 2021 18:19:04 +0200 Subject: [PATCH 60/60] format2 --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 +- .../plugins/training_type/training_type_plugin.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 7011b81a8d131..3dc52b60055d8 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -489,7 +489,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None: def restore_model_state_from_ckpt_path( self, ckpt_path: str, - map_location=lambda storage, loc: storage, + map_location: Callable = lambda storage, loc: storage, ) -> Tuple[Dict, bool]: if self.world_size > 1: from pytorch_lightning.trainer.states import TrainerState diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index d155ce018c77d..01c23504b7773 100644 --- 
a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -198,9 +198,11 @@ def setup_optimizers_in_pre_dispatch(self) -> bool: """ return False - def restore_model_state_from_ckpt_path(self, - ckpt_path: str, - map_location=lambda storage, loc: storage) -> Tuple[Dict, bool]: + def restore_model_state_from_ckpt_path( + self, + ckpt_path: str, + map_location: Callable = lambda storage, loc: storage, + ) -> Tuple[Dict, bool]: """ This function is used to load and restore the model state.