From 2950f669834506f8e5845b318b0f25d52d19e331 Mon Sep 17 00:00:00 2001
From: Travis Addair
Date: Fri, 1 May 2020 11:13:35 -0700
Subject: [PATCH] Fix Horovod distributed backend to set the root_gpu property
 (#1669)

* params

* drop acc

* Fix Horovod distributed backend to set the root_gpu

* Fixed test

* Fixed tests

* Fixed lint

* Set root_gpu during initialization

* chlog

Co-authored-by: Jirka
---
 CHANGELOG.md                                      |  4 +++-
 .../trainer/distrib_data_parallel.py              | 16 ++++++++++++----
 pytorch_lightning/trainer/distrib_parts.py        |  8 +++-----
 tests/models/data/horovod/train_default_model.py  |  8 +++++++-
 tests/models/test_horovod.py                      |  7 +++++--
 5 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f67e85a452ff9..94675a20111c0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,10 +18,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-- Fixed broken link in PR template ([#1675](https://github.com/PyTorchLightning/pytorch-lightning/pull/1675))
 - Fixed ModelCheckpoint not None checking filepath ([1654](https://github.com/PyTorchLightning/pytorch-lightning/pull/1654))
+
 - Trainer now calls `on_load_checkpoint()` when resuming from a checkpoint ([1666](https://github.com/PyTorchLightning/pytorch-lightning/pull/1666))
 
+- Fixed Horovod distributed backend to set the `root_gpu` property ([#1669](https://github.com/PyTorchLightning/pytorch-lightning/pull/1669))
+
 
 ## [0.7.5] - 2020-04-27
 
diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 56c7bae8ec6a7..8651dd5c1b5a0 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -194,8 +194,7 @@ def set_distributed_mode(self, distributed_backend):
 
         if distributed_backend is None:
             if self.has_horovodrun():
-                self.check_horovod()
-                self.use_horovod = True
+                self._set_horovod_backend()
             elif self.num_gpus == 0:
                 if self.num_nodes > 1 or self.num_processes > 1:
                     self.use_ddp = True  # ddp_cpu
@@ -235,8 +234,7 @@ def set_distributed_mode(self, distributed_backend):
             self.data_parallel_device_ids = None
             self.on_gpu = False
         elif distributed_backend == 'horovod':
-            self.check_horovod()
-            self.use_horovod = True
+            self._set_horovod_backend()
 
         # throw error to force user ddp or ddp2 choice
         if self.num_nodes > 1 and not (self.use_ddp2 or self.use_ddp):
@@ -421,6 +419,16 @@ def resolve_root_node_address(self, root_node):
 
         return root_node
 
+    def _set_horovod_backend(self):
+        self.check_horovod()
+        self.use_horovod = True
+
+        # Initialize Horovod to get rank / size info
+        hvd.init()
+        if self.on_gpu:
+            # Horovod assigns one local GPU per process
+            self.root_gpu = hvd.local_rank()
+
     def check_horovod(self):
         """Raises a `MisconfigurationException` if the Trainer is not configured correctly for Horovod."""
         if not HOROVOD_AVAILABLE:
diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py
index db4e132c0b445..a9f4b6114522e 100644
--- a/pytorch_lightning/trainer/distrib_parts.py
+++ b/pytorch_lightning/trainer/distrib_parts.py
@@ -570,13 +570,11 @@ def dp_train(self, model):
         model.forward = model_autocast_original_forward
 
     def horovod_train(self, model):
-        # Horovod: initialize library
-        hvd.init()
-
         if torch.cuda.is_available() and self.on_gpu:
             # Horovod: pin GPU to local rank
-            torch.cuda.set_device(hvd.local_rank())
-            model.cuda(hvd.local_rank())
+            assert self.root_gpu == hvd.local_rank()
+            torch.cuda.set_device(self.root_gpu)
+            model.cuda(self.root_gpu)
 
         # Only show progress bar from the first worker
         self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if hvd.rank() == 0 else 0
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 6c11e2ca5e755..3410cdc1d5051 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -27,12 +27,14 @@
 PATH_ROOT = os.path.join(PATH_HERE, '..', '..', '..', '..')
 sys.path.insert(0, os.path.abspath(PATH_ROOT))
 
+from pytorch_lightning import Trainer  # noqa: E402
 from pytorch_lightning.callbacks import ModelCheckpoint  # noqa: E402
 import tests.base.utils as tutils  # noqa: E402
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--trainer-options', required=True)
+parser.add_argument('--on-gpu', action='store_true', default=False)
 
 
 def run_test_from_config(trainer_options):
@@ -44,11 +46,15 @@ def run_test_from_config(trainer_options):
     trainer_options['checkpoint_callback'] = ModelCheckpoint(ckpt_path)
 
     model, hparams = tutils.get_default_model()
-    tutils.run_model_test(trainer_options, model, version=0, with_hpc=False)
+    tutils.run_model_test(trainer_options, model, on_gpu=args.on_gpu, version=0, with_hpc=False)
 
     # Horovod should be initialized following training. If not, this will raise an exception.
     assert hvd.size() == 2
 
+    if args.on_gpu:
+        # Test the root_gpu property
+        assert Trainer(gpus=1, distributed_backend='horovod', max_epochs=1).root_gpu == hvd.local_rank()
+
 
 if __name__ == "__main__":
     args = parser.parse_args()
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index c4bcb4b81b995..21a90c191579b 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -38,10 +38,12 @@ def _nccl_available():
         return False
 
 
-def _run_horovod(trainer_options):
+def _run_horovod(trainer_options, on_gpu=False):
     """Execute the training script across multiple workers in parallel."""
     cmdline = ['horovodrun', '-np', '2', sys.executable, TEST_SCRIPT,
                '--trainer-options', shlex.quote(json.dumps(trainer_options))]
+    if on_gpu:
+        cmdline += ['--on-gpu']
     exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy())
     assert exit_code == 0
 
@@ -93,7 +95,7 @@ def test_horovod_multi_gpu(tmpdir):
         gpus=1,
         distributed_backend='horovod'
     )
-    _run_horovod(trainer_options)
+    _run_horovod(trainer_options, on_gpu=True)
 
 
 @pytest.mark.skipif(sys.version_info >= (3, 8), reason="Horovod not yet supported in Python 3.8")
@@ -159,5 +161,6 @@ def get_model_params(model):
     def get_optimizer_params(optimizer):
         return set([p for group in optimizer.param_groups for p in group.get('params', [])])
 
+    assert get_model_params(model.generator) != get_model_params(model.discriminator)
    assert get_model_params(model.generator) == get_optimizer_params(trainer.optimizers[0])
    assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])
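
For reference, a minimal sketch of the command the updated `_run_horovod` helper ends up building once a test passes `on_gpu=True`. The `trainer_options` values below are illustrative placeholders, not values copied from the test suite; only the script path, the `--trainer-options` argument, and the new `--on-gpu` flag come from the diff above.

    # Python sketch mirroring the cmdline construction in tests/models/test_horovod.py
    import json
    import os
    import shlex
    import subprocess
    import sys

    TEST_SCRIPT = 'tests/models/data/horovod/train_default_model.py'  # path from this patch
    trainer_options = {'max_epochs': 1, 'gpus': 1, 'distributed_backend': 'horovod'}  # illustrative only

    # Launch two Horovod workers, forwarding the serialized trainer options and the new GPU flag.
    cmdline = ['horovodrun', '-np', '2', sys.executable, TEST_SCRIPT,
               '--trainer-options', shlex.quote(json.dumps(trainer_options)),
               '--on-gpu']  # flag introduced by this patch
    exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy())
    assert exit_code == 0

Inside the launched script, each worker then asserts that `Trainer(gpus=1, distributed_backend='horovod', max_epochs=1).root_gpu` equals `hvd.local_rank()`, which is exactly the behavior this patch fixes by setting `root_gpu` in `_set_horovod_backend`.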