From 41aeb7713422d34b2db1463d9cf6168637cf5bc8 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 2 Mar 2021 11:56:21 +0000
Subject: [PATCH 1/6] Ensure we check deepspeed/sharded in multinode

---
 .../connectors/accelerator_connector.py      | 10 +++++-----
 .../test_accelerator_connector.py            | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 30bfbe2d963db..e60cb5abd0ab2 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -536,12 +536,12 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()
 
-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )
 
         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 50c9ccd47dfed..42eaa0089a37f 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -28,6 +28,8 @@
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
@@ -415,3 +417,20 @@ def test_plugin_accelerator_choice(accelerator, plugin):
     trainer = Trainer(plugins=plugin, num_processes=2)
 
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(
+    ["accelerator", "plugin"],
+    [('ddp', DDPPlugin), ('ddp_spawn', DDPSpawnPlugin), ('ddp_sharded', DDPShardedPlugin),
+     ('ddp_sharded_spawn', DDPSpawnShardedPlugin), ('deepspeed', DeepSpeedPlugin)],
+)
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_available, mock_device_count, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)
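
Aside (illustration only, not part of the patch series): with the connector change in PATCH 1/6, asking for a distributed mode other than ddp/ddp2 together with num_nodes > 1 fails fast instead of silently switching backends. A minimal sketch of how that surfaces to a user, assuming the 1.2-era API as patched and a machine with 2 visible GPUs (on a GPU-less machine the GPU parsing raises a different error first); "dp" stands in for any backend that cannot span nodes:

    import pytest

    from pytorch_lightning import Trainer
    from pytorch_lightning.utilities.exceptions import MisconfigurationException

    # DataParallel ("dp") cannot span nodes, so the connector now raises
    # instead of switching the backend behind the user's back.
    with pytest.raises(MisconfigurationException, match="does not support num_nodes > 1"):
        Trainer(accelerator="dp", gpus=2, num_nodes=2)
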
From d6a56c73c9d6f8252c7280cf22fb6447f3087be0 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 2 Mar 2021 11:59:22 +0000
Subject: [PATCH 2/6] Add CHANGELOG.md

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d98b646d0e6f1..502da63a4903f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -86,6 +86,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
 
+- Fix error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297)
+
+
 ## [1.2.1] - 2021-02-23
 
 ### Fixed

From 9f0300803f9e3c631e7b23734f8658d16bd36622 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 2 Mar 2021 11:59:32 +0000
Subject: [PATCH 3/6] Add CHANGELOG.md

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 502da63a4903f..679fa375bff81 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -86,7 +86,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
 
-- Fix error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297)
+- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297)
 
 
 ## [1.2.1] - 2021-02-23

From a9ae51967ddfafcf719fe68a27f629155d506bcb Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 2 Mar 2021 12:44:39 +0000
Subject: [PATCH 4/6] Drop mock, use actual multi-gpu node

---
 tests/accelerators/test_accelerator_connector.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 42eaa0089a37f..949e577f88bb3 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -34,6 +34,7 @@
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -424,9 +425,8 @@ def test_plugin_accelerator_choice(accelerator, plugin):
     [('ddp', DDPPlugin), ('ddp_spawn', DDPSpawnPlugin), ('ddp_sharded', DDPShardedPlugin),
      ('ddp_sharded_spawn', DDPSpawnShardedPlugin), ('deepspeed', DeepSpeedPlugin)],
 )
-@mock.patch('torch.cuda.is_available', return_value=True)
-@mock.patch('torch.cuda.device_count', return_value=2)
-def test_accelerator_choice_multi_node_gpu(mock_available, mock_device_count, accelerator, plugin, tmpdir):
+@RunIf(pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available."), min_gpus=2, skip_windows=True)
+def test_accelerator_choice_multi_node_gpu(accelerator, plugin, tmpdir):
     trainer = Trainer(
         accelerator=accelerator,
         default_root_dir=tmpdir,
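
Aside (illustration only, not part of the patch series): PATCH 5/6 below replaces the whole-test DeepSpeed gate with a per-case mark, so only the deepspeed case is skipped when the optional dependency is missing while the other cases still run. A minimal, self-contained sketch of that pytest pattern, using a hypothetical availability flag:

    import pytest

    _DEEPSPEED_AVAILABLE = False  # hypothetical stand-in for the real import guard

    @pytest.mark.parametrize("backend", [
        "ddp",
        "ddp_sharded",
        # only this case is skipped when DeepSpeed is missing;
        # the other parametrized cases still run
        pytest.param(
            "deepspeed",
            marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
        ),
    ])
    def test_backend_name(backend):
        assert isinstance(backend, str)
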
From 38d7f8783f694253cc63d203d84d36e9a3311d25 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Tue, 2 Mar 2021 13:55:55 +0100
Subject: [PATCH 5/6] Address comment

---
 .../accelerators/test_accelerator_connector.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 949e577f88bb3..31bbb8a0b878d 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -420,12 +420,18 @@ def test_plugin_accelerator_choice(accelerator, plugin):
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
 
 
-@pytest.mark.parametrize(
-    ["accelerator", "plugin"],
-    [('ddp', DDPPlugin), ('ddp_spawn', DDPSpawnPlugin), ('ddp_sharded', DDPShardedPlugin),
-     ('ddp_sharded_spawn', DDPSpawnShardedPlugin), ('deepspeed', DeepSpeedPlugin)],
-)
-@RunIf(pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available."), min_gpus=2, skip_windows=True)
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@RunIf(min_gpus=2, skip_windows=True)
 def test_accelerator_choice_multi_node_gpu(accelerator, plugin, tmpdir):
     trainer = Trainer(
         accelerator=accelerator,

From 4e6396ab4055e0aa2aa94919dbc894ffb0a1008f Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 2 Mar 2021 13:01:23 +0000
Subject: [PATCH 6/6] Add back mock

---
 tests/accelerators/test_accelerator_connector.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 31bbb8a0b878d..cd2b3041e7673 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -431,8 +431,9 @@ def test_plugin_accelerator_choice(accelerator, plugin):
         marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
     ),
 ])
-@RunIf(min_gpus=2, skip_windows=True)
-def test_accelerator_choice_multi_node_gpu(accelerator, plugin, tmpdir):
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_is_available, mock_device_count, accelerator, plugin, tmpdir):
     trainer = Trainer(
         accelerator=accelerator,
         default_root_dir=tmpdir,
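
Closing aside (illustration only, not part of the patches): PATCH 6/6 restores the CUDA mocks so the multi-node choice test can run on CPU-only CI. A self-contained sketch of how stacked mock.patch decorators pair with injected arguments, since patches are applied bottom-up and the decorator closest to the function supplies the first mock argument:

    import torch
    from unittest import mock

    @mock.patch('torch.cuda.is_available', return_value=True)   # applied last, injected second
    @mock.patch('torch.cuda.device_count', return_value=2)      # applied first, injected first
    def fake_two_gpu_env(mock_device_count, mock_is_available):
        # inside the function both CUDA queries report the mocked 2-GPU machine
        assert torch.cuda.is_available() and torch.cuda.device_count() == 2

    fake_two_gpu_env()
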