From c5e9d67d16aa4a86e5be51700f4c4bbf3154dc2b Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Tue, 2 Mar 2021 13:36:18 +0000
Subject: [PATCH] [fix] Ensure we check deepspeed/sharded in multinode DDP
 (#6297)

* Ensure we check deepspeed/sharded in multinode

* Add CHANGELOG.md

* Add CHANGELOG.md

* Drop mock, use actual multi-gpu node
---
 CHANGELOG.md                                 |  3 +++
 .../connectors/accelerator_connector.py      | 10 +++----
 .../test_accelerator_connector.py            | 27 +++++++++++++++++++
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0dc186ba3630c..21067ea7453bf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Prevent `WandbLogger` from dropping values ([#5931](https://github.com/PyTorchLightning/pytorch-lightning/pull/5931))
 
 
+- Fixed error thrown when using a valid distributed mode in multi-node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))
+
+
 ## [1.2.1] - 2021-02-23
 
 ### Fixed
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 7021081d6cc90..fb6a1d4ab8442 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -531,12 +531,12 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()
 
-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw an error to force the user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )
 
         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 82b631807c8e9..fa02fe819ae21 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -28,10 +28,14 @@
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 
 
@@ -400,3 +404,26 @@ def test_plugin_accelerator_choice(accelerator, plugin):
     trainer = Trainer(plugins=plugin, num_processes=2)
 
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_is_available, mock_device_count, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)
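
To illustrate the behavior change outside of Lightning, the sketch below is a minimal standalone version of the validation idea, not the library's code: DistributedType and MisconfigurationException are local stand-ins, and SUPPORTED_MULTI_NODE / check_multi_node are hypothetical names that mirror the `use_ddp or use_ddp2` guard, which (per the new test) now accepts sharded and DeepSpeed backends with num_nodes > 1 while still rejecting DP-style backends.

    # Standalone sketch of the multi-node validation idea in this patch.
    # NOT Lightning source code; all names below are local stand-ins.
    from enum import Enum


    class DistributedType(str, Enum):
        DP = "dp"
        DDP = "ddp"
        DDP_SPAWN = "ddp_spawn"
        DDP2 = "ddp2"
        DDP_SHARDED = "ddp_sharded"
        DDP_SHARDED_SPAWN = "ddp_sharded_spawn"
        DEEPSPEED = "deepspeed"


    class MisconfigurationException(Exception):
        """Local stand-in for pytorch_lightning.utilities.exceptions.MisconfigurationException."""


    # Before the patch, only DDP/DDP_SPAWN/DDP2 were whitelisted, so sharded and
    # DeepSpeed backends wrongly raised in multi-node runs. The fixed guard accepts
    # any DDP-style backend, mirroring the `use_ddp or use_ddp2` check above.
    SUPPORTED_MULTI_NODE = {
        DistributedType.DDP,
        DistributedType.DDP_SPAWN,
        DistributedType.DDP2,
        DistributedType.DDP_SHARDED,
        DistributedType.DDP_SHARDED_SPAWN,
        DistributedType.DEEPSPEED,
    }


    def check_multi_node(distrib_type: DistributedType, num_nodes: int) -> None:
        """Raise if the selected backend cannot be used with num_nodes > 1."""
        if num_nodes > 1 and distrib_type not in SUPPORTED_MULTI_NODE:
            raise MisconfigurationException(
                "Your chosen distributed type does not support num_nodes > 1. "
                "Please set accelerator=ddp or accelerator=ddp2."
            )


    if __name__ == "__main__":
        check_multi_node(DistributedType.DEEPSPEED, num_nodes=2)  # accepted after the fix
        try:
            check_multi_node(DistributedType.DP, num_nodes=2)  # still rejected
        except MisconfigurationException as err:
            print(f"rejected as expected: {err}")

In a real multi-node job this corresponds to a call such as Trainer(accelerator='ddp_sharded', num_nodes=2, gpus=2), which is the configuration exercised by the new test_accelerator_choice_multi_node_gpu test above.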