diff --git a/CHANGELOG.md b/CHANGELOG.md
index d98b646d0e6f1..679fa375bff81 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -86,6 +86,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
 
+- Fixed error thrown when using a valid distributed mode with multiple nodes ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))
+
+
 ## [1.2.1] - 2021-02-23
 
 ### Fixed
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 30bfbe2d963db..e60cb5abd0ab2 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -536,12 +536,12 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()
 
-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )
 
         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
index 50c9ccd47dfed..cd2b3041e7673 100644
--- a/tests/accelerators/test_accelerator_connector.py
+++ b/tests/accelerators/test_accelerator_connector.py
@@ -28,10 +28,13 @@
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -415,3 +418,26 @@ def test_plugin_accelerator_choice(accelerator, plugin):
     trainer = Trainer(plugins=plugin, num_processes=2)
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_is_available, mock_device_count, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)