From 10b16dbfabcca473c41249ccaaa1afefb3f72f4d Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Tue, 12 May 2020 06:54:23 -0400
Subject: [PATCH] made ddp the default if no backend specified with multiple
 GPUs (#1789)

* made ddp the default if no backend specified with multiple GPUs

* fix

* spawn

Co-authored-by: Jirka
---
 docs/source/multi_gpu.rst                          | 2 ++
 pytorch_lightning/trainer/distrib_data_parallel.py | 4 ++--
 tests/models/test_gpu.py                           | 1 +
 tests/trainer/test_trainer.py                      | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index 9e32b2b0bba96..b7ebcce15687a 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -132,6 +132,8 @@ Lightning allows multiple ways of training
 - Horovod (`distributed_backend='horovod'`) (multi-machine, multi-gpu, configured at runtime)
 - TPUs (`num_tpu_cores=8|x`) (tpu or TPU pod)
 
+.. note:: If you request multiple GPUs without setting a mode, ddp will be automatically used.
+
 Data Parallel (dp)
 ^^^^^^^^^^^^^^^^^^
 `DataParallel `_ splits a batch across k GPUs. That is, if you have a batch of 32 and use dp with 2 gpus,
diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 4bf0c7ff4d56f..bd97d5ca339b0 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -203,8 +203,8 @@ def set_distributed_mode(self, distributed_backend):
             elif self.num_gpus > 1:
                 rank_zero_warn('You requested multiple GPUs but did not specify a backend, e.g.'
                                ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
-                               ' Setting distributed_backend=dp for you.')
-                self.use_dp = True
+                               ' Setting distributed_backend=ddp for you.')
+                self.use_ddp = True
         elif distributed_backend == "dp":
             # do nothing if num_gpus == 0
             if self.num_gpus == 1:
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 6eafc19d863ee..f75b0a1f1a582 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -130,6 +130,7 @@ def assert_pred_same():
     trainer.fit(model)
 
 
+@pytest.mark.spawn
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_none_backend(tmpdir):
     """Make sure when using multiple GPUs the user can't use `distributed_backend = None`."""
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 40783b1ad84e2..e6cdc65338f2c 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -712,7 +712,7 @@ def test_gpu_choice(tmpdir):
         ),
         pytest.param(
             dict(distributed_backend=None, gpus=2),
-            dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1),
+            dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1),
            marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")]
        ),
        pytest.param(
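
Note: the sketch below is not part of the patch; it only illustrates the user-facing effect of the change, assuming the 0.7-era Trainer arguments (`gpus`, `distributed_backend`) that appear in the diff above.

    import pytorch_lightning as pl

    # With this patch, requesting multiple GPUs without naming a backend
    # warns and selects DistributedDataParallel (ddp) instead of the old
    # default, DataParallel (dp).
    trainer = pl.Trainer(gpus=2, distributed_backend=None)  # now resolves to ddp

    # The previous behaviour is still available by asking for dp explicitly.
    trainer_dp = pl.Trainer(gpus=2, distributed_backend='dp')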