From 29f3a2a60ef24328c3c126614a7ba20a39c2fa2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 15 May 2020 00:39:20 +0200 Subject: [PATCH 01/38] refactor and added hook variant a variant b add test revert rename add changelog docs --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 321319d6a216c..b30c0f49253f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,6 +94,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added using `store_true` for bool args ([#1822](https://github.com/PyTorchLightning/pytorch-lightning/pull/1822), [#1842](https://github.com/PyTorchLightning/pytorch-lightning/pull/1842)) - Added dummy logger for internally disabling logging for some features ([#1836](https://github.com/PyTorchLightning/pytorch-lightning/pull/1836)) +- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)). + ### Changed - Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) From 6727067e3986fa72886fed5f052ce40b3f6c821f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 20 May 2020 20:50:48 +0200 Subject: [PATCH 02/38] move changelog entry to top --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b30c0f49253f6..321319d6a216c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,8 +94,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added using `store_true` for bool args ([#1822](https://github.com/PyTorchLightning/pytorch-lightning/pull/1822), [#1842](https://github.com/PyTorchLightning/pytorch-lightning/pull/1842)) - Added dummy logger for internally disabling logging for some features ([#1836](https://github.com/PyTorchLightning/pytorch-lightning/pull/1836)) -- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)). 
- ### Changed - Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) From 31d60def5e140438d41b1972f36c240a6e418e3d Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 04:17:00 +0100 Subject: [PATCH 03/38] First attempt at auto-moving data for inference --- pytorch_lightning/core/lightning.py | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 7632822e462c3..1a98c514a552c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -102,6 +102,51 @@ def forward(self, x): if self.trainer.proc_rank == 0: print(*args, **kwargs) + # Note this is almost identical to distrib_parts.TrainerDPMixin.__transfer_data_to_device + def __transfer_data_to_device(self, batch, device): + if device == 'tpu' and XLA_AVAILABLE: + # base case: object can be directly moved using `to` + if callable(getattr(batch, 'to', None)): + return batch.to(xm.xla_device()) + + if device == 'gpu': + # base case: object can be directly moved using `cuda` or `to` + if callable(getattr(batch, 'cuda', None)): + return batch.to(device=device) + + if callable(getattr(batch, 'to', None)): + return batch.to(device=device) + + # when list + if isinstance(batch, list): + for i, x in enumerate(batch): + batch[i] = self.__transfer_data_to_device(x, device) + return batch + + # when tuple + if isinstance(batch, tuple): + batch = list(batch) + for i, x in enumerate(batch): + batch[i] = self.__transfer_data_to_device(x, device) + return tuple(batch) + + # when dict + if isinstance(batch, dict): + for k, v in batch.items(): + batch[k] = self.__transfer_data_to_device(v, device) + + return batch + + # nothing matches, return the value as is without transform + return batch + + def __call__(self, *input_data, **kwargs): + device = [p.device for p in self.parameters()] + assert all([device[0] == d for d in device]), 'All parameters must be on same device' + input_data = self.__transfer_data_to_device(input_data, device[0]) + kwargs = self.__transfer_data_to_device(kwargs, device[0]) + return super(LightningModule, self).__call__(*input_data, *kwargs) + @abstractmethod def forward(self, *args, **kwargs): r""" From 678e5804c03730180cb1e740c967aca6560dc7d9 Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 04:37:24 +0100 Subject: [PATCH 04/38] Correct my copypaste errors --- pytorch_lightning/core/lightning.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 1a98c514a552c..54cf71a2e54b4 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -104,18 +104,12 @@ def forward(self, x): # Note this is almost identical to distrib_parts.TrainerDPMixin.__transfer_data_to_device def __transfer_data_to_device(self, batch, device): - if device == 'tpu' and XLA_AVAILABLE: - # base case: object can be directly moved using `to` - if callable(getattr(batch, 'to', None)): - return batch.to(xm.xla_device()) - - if device == 'gpu': - # base case: object can be directly moved using `cuda` or `to` - if callable(getattr(batch, 'cuda', None)): - return batch.to(device=device) - - if callable(getattr(batch, 'to', None)): - return batch.to(device=device) + # base case: object can be directly moved using `cuda` or `to` + if callable(getattr(batch, 'cuda', None)): + return 
batch.cuda(device=device) + + if callable(getattr(batch, 'to', None)): + return batch.to(device=device) # when list if isinstance(batch, list): From e5e60a46baaab938b951eedb65226f67493a74ad Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 04:50:29 +0100 Subject: [PATCH 05/38] Correct for if device is CPU --- pl_examples/basic_examples/gpu_template.py | 6 +++++- pytorch_lightning/core/lightning.py | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py index c5fa94a3cf140..690c2e6418dbe 100644 --- a/pl_examples/basic_examples/gpu_template.py +++ b/pl_examples/basic_examples/gpu_template.py @@ -38,7 +38,11 @@ def main(hparams): # ------------------------ # 3 START TRAINING # ------------------------ - trainer.fit(model) + #trainer.fit(model) + model = model.cuda(0) + x = torch.randn(2, 784) + print(x.device) + print(model(x).device) if __name__ == '__main__': diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 54cf71a2e54b4..06b866776731c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -103,8 +103,13 @@ def forward(self, x): print(*args, **kwargs) # Note this is almost identical to distrib_parts.TrainerDPMixin.__transfer_data_to_device + # Only works for GPU and not TPU for now def __transfer_data_to_device(self, batch, device): - # base case: object can be directly moved using `cuda` or `to` + # base case: nothing to do + if torch.is_tensor(batch) and batch.device == device: + return batch + + # object can be directly moved using `cuda` or `to` if callable(getattr(batch, 'cuda', None)): return batch.cuda(device=device) From e4dfbaf073c85b9cb047bd6fd47bc65fa2f8e970 Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 04:53:06 +0100 Subject: [PATCH 06/38] Get rid of the WIP code I accidentally added --- pl_examples/basic_examples/gpu_template.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py index 690c2e6418dbe..c5fa94a3cf140 100644 --- a/pl_examples/basic_examples/gpu_template.py +++ b/pl_examples/basic_examples/gpu_template.py @@ -38,11 +38,7 @@ def main(hparams): # ------------------------ # 3 START TRAINING # ------------------------ - #trainer.fit(model) - model = model.cuda(0) - x = torch.randn(2, 784) - print(x.device) - print(model(x).device) + trainer.fit(model) if __name__ == '__main__': From 0ba04baa2d285a363db8ca5adac08b79cae6436e Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 05:37:04 +0100 Subject: [PATCH 07/38] Add tests --- tests/models/test_gpu.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 80249a727ccbb..baca34f70e0b2 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -97,6 +97,24 @@ def test_multi_gpu_none_backend(tmpdir): tutils.run_model_test(trainer_options, model) +def test_auto_move_data(tmpdir): + """Make sure auto moving data works""" + + tutils.reset_seed() + tutils.set_random_master_port() + + model, hparams = tutils.get_default_model() + model = model.cuda(0) + model.prepare_data() + loader = model.train_dataloader() + correct_device = [p.device for p in model.parameters()] + assert all([correct_device[0] == d for d in correct_device]), 'All parameters must be on same device' + correct_device = 
correct_device[0] + for x, y in loader: + x = x.view(x.size(0), -1) + assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" + + @pytest.fixture def mocked_device_count(monkeypatch): def device_count(): From f50d1ea34701f21a0741bc394893b4ffbe9c943c Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 05:45:07 +0100 Subject: [PATCH 08/38] Make tests more foolproof --- pytorch_lightning/core/lightning.py | 4 +++- tests/models/test_cpu.py | 33 +++++++++++++++++++++++++++++ tests/models/test_gpu.py | 2 ++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 06b866776731c..f4c248e1057c6 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -110,7 +110,7 @@ def __transfer_data_to_device(self, batch, device): return batch # object can be directly moved using `cuda` or `to` - if callable(getattr(batch, 'cuda', None)): + if callable(getattr(batch, 'cuda', None)) and device.type == 'cuda': return batch.cuda(device=device) if callable(getattr(batch, 'to', None)): @@ -151,6 +151,8 @@ def forward(self, *args, **kwargs): r""" Same as :meth:`torch.nn.Module.forward()`, however in Lightning you want this to define the operations you want to use for prediction (i.e.: on a server or as a feature extractor). + LightningModule will also automatically copy data to the same device as the model if the model + is on CPU or GPU Normally you'd call ``self()`` from your :meth:`training_step` method. This makes it easy to write a complex system for training with the outputs diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 051db81d1b165..d2d5ef5a3afcd 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -397,3 +397,36 @@ def train_dataloader(self): result = trainer.fit(model) assert result == 1, 'training failed to complete' + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_single_gpu_model(tmpdir): + """Make sure single GPU works (DP mode).""" + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + train_percent_check=0.1, + val_percent_check=0.1, + gpus=1 + ) + + model = EvalModelTemplate() + tutils.run_model_test(trainer_options, model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_auto_move_data(tmpdir): + """Make sure auto moving data works for the base case where it doesn't have to move anything""" + + tutils.reset_seed() + tutils.set_random_master_port() + + model, hparams = tutils.get_default_model() + model.prepare_data() + loader = model.train_dataloader() + for x, y in loader: + x = x.view(x.size(0), -1) + assert model(x).device == torch.device('cpu'), "Automoving data to same device as model failed" + assert model(x.cuda(0)).device == torch.device('cpu'), "Automoving data to same device as model failed" + diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index baca34f70e0b2..5142b3738f5da 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -97,6 +97,7 @@ def test_multi_gpu_none_backend(tmpdir): tutils.run_model_test(trainer_options, model) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_auto_move_data(tmpdir): """Make sure auto moving data works""" @@ -113,6 +114,7 @@ def test_auto_move_data(tmpdir): for x, y in loader: x = x.view(x.size(0), -1) 
assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" + assert model(x.cuda(0)).device == torch.device('cuda:0'), "Automoving data to same device as model failed" @pytest.fixture From d7e64f4e3552b385ea859bbfacbaab419b5d0bce Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 05:48:26 +0100 Subject: [PATCH 09/38] Make sure we stick with pep8 formatting --- tests/models/test_cpu.py | 3 ++- tests/models/test_gpu.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index d2d5ef5a3afcd..cc98b7f806961 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -428,5 +428,6 @@ def test_auto_move_data(tmpdir): for x, y in loader: x = x.view(x.size(0), -1) assert model(x).device == torch.device('cpu'), "Automoving data to same device as model failed" - assert model(x.cuda(0)).device == torch.device('cpu'), "Automoving data to same device as model failed" + x = x.cuda(0) + assert model(x).device == torch.device('cpu'), "Automoving data to same device as model failed" diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 5142b3738f5da..21ec70c34da08 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -114,7 +114,8 @@ def test_auto_move_data(tmpdir): for x, y in loader: x = x.view(x.size(0), -1) assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" - assert model(x.cuda(0)).device == torch.device('cuda:0'), "Automoving data to same device as model failed" + x = x.cuda(0) + assert model().device == torch.device('cuda:0'), "Automoving data to same device as model failed" @pytest.fixture From ba0ddae4131db35ea2603858edb3f1a020fad34f Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Sun, 19 Apr 2020 06:01:01 +0100 Subject: [PATCH 10/38] Clarify docs a little --- pytorch_lightning/core/lightning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index f4c248e1057c6..41741567fd30f 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -152,7 +152,7 @@ def forward(self, *args, **kwargs): Same as :meth:`torch.nn.Module.forward()`, however in Lightning you want this to define the operations you want to use for prediction (i.e.: on a server or as a feature extractor). LightningModule will also automatically copy data to the same device as the model if the model - is on CPU or GPU + is on CPU or GPU for inference. Normally you'd call ``self()`` from your :meth:`training_step` method. 
This makes it easy to write a complex system for training with the outputs From cbfd6149ba69bc0055471bc45a333b07e607fa20 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 19 Apr 2020 11:38:25 +0200 Subject: [PATCH 11/38] Apply suggestions from code review --- pytorch_lightning/core/lightning.py | 22 +++++++++------------- tests/models/test_gpu.py | 3 --- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 41741567fd30f..d0b18938bba5a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -116,19 +116,14 @@ def __transfer_data_to_device(self, batch, device): if callable(getattr(batch, 'to', None)): return batch.to(device=device) - # when list - if isinstance(batch, list): + # when list or tuple + if isinstance(batch, (list, tuple)): + if isinstance(batch, tuple): + batch = list(batch) for i, x in enumerate(batch): batch[i] = self.__transfer_data_to_device(x, device) return batch - # when tuple - if isinstance(batch, tuple): - batch = list(batch) - for i, x in enumerate(batch): - batch[i] = self.__transfer_data_to_device(x, device) - return tuple(batch) - # when dict if isinstance(batch, dict): for k, v in batch.items(): @@ -140,10 +135,11 @@ def __transfer_data_to_device(self, batch, device): return batch def __call__(self, *input_data, **kwargs): - device = [p.device for p in self.parameters()] - assert all([device[0] == d for d in device]), 'All parameters must be on same device' - input_data = self.__transfer_data_to_device(input_data, device[0]) - kwargs = self.__transfer_data_to_device(kwargs, device[0]) + devices = [p.device for p in self.parameters()] + assert set(devices) == 1, 'All parameters must be on same device' + device = devices[0] + input_data = self.__transfer_data_to_device(input_data, device) + kwargs = self.__transfer_data_to_device(kwargs, device) return super(LightningModule, self).__call__(*input_data, *kwargs) @abstractmethod diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 21ec70c34da08..51ecdc5e41807 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -108,9 +108,6 @@ def test_auto_move_data(tmpdir): model = model.cuda(0) model.prepare_data() loader = model.train_dataloader() - correct_device = [p.device for p in model.parameters()] - assert all([correct_device[0] == d for d in correct_device]), 'All parameters must be on same device' - correct_device = correct_device[0] for x, y in loader: x = x.view(x.size(0), -1) assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" From d2ebd27da5dd41cb9c377a9b85c606dfef1e24e4 Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Mon, 20 Apr 2020 00:29:18 +0100 Subject: [PATCH 12/38] Get everything working again hopefully --- pytorch_lightning/core/lightning.py | 17 ++++++++++------- tests/models/test_gpu.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d0b18938bba5a..94e9a75a15692 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -111,9 +111,11 @@ def __transfer_data_to_device(self, batch, device): # object can be directly moved using `cuda` or `to` if callable(getattr(batch, 'cuda', None)) and device.type == 'cuda': + rank_zero_warn('Auto moving data from {} to {} to match model'.format(batch.device, device)) return batch.cuda(device=device) if 
callable(getattr(batch, 'to', None)): + rank_zero_warn('Auto moving data from {} to {} to match model'.format(batch.device, device)) return batch.to(device=device) # when list or tuple @@ -128,7 +130,6 @@ def __transfer_data_to_device(self, batch, device): if isinstance(batch, dict): for k, v in batch.items(): batch[k] = self.__transfer_data_to_device(v, device) - return batch # nothing matches, return the value as is without transform @@ -136,11 +137,13 @@ def __transfer_data_to_device(self, batch, device): def __call__(self, *input_data, **kwargs): devices = [p.device for p in self.parameters()] - assert set(devices) == 1, 'All parameters must be on same device' - device = devices[0] - input_data = self.__transfer_data_to_device(input_data, device) - kwargs = self.__transfer_data_to_device(kwargs, device) - return super(LightningModule, self).__call__(*input_data, *kwargs) + # All parameters must be on same device to automove data + # Otherwise we just do what nn.Module does normally + if len(set(devices)) == 1: + device = devices[0] + input_data = self.__transfer_data_to_device(input_data, device) + kwargs = self.__transfer_data_to_device(kwargs, device) + return super(LightningModule, self).__call__(*input_data, **kwargs) @abstractmethod def forward(self, *args, **kwargs): @@ -148,7 +151,7 @@ def forward(self, *args, **kwargs): Same as :meth:`torch.nn.Module.forward()`, however in Lightning you want this to define the operations you want to use for prediction (i.e.: on a server or as a feature extractor). LightningModule will also automatically copy data to the same device as the model if the model - is on CPU or GPU for inference. + is on CPU or a single GPU for inference. Normally you'd call ``self()`` from your :meth:`training_step` method. This makes it easy to write a complex system for training with the outputs diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 51ecdc5e41807..c9698de2366ea 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -112,7 +112,7 @@ def test_auto_move_data(tmpdir): x = x.view(x.size(0), -1) assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" x = x.cuda(0) - assert model().device == torch.device('cuda:0'), "Automoving data to same device as model failed" + assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" @pytest.fixture From 18267b801c4656a91ad44f6b7b3be7d0dc0c8320 Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Mon, 20 Apr 2020 01:19:41 +0100 Subject: [PATCH 13/38] Move data transfer to utilities --- pl_examples/basic_examples/gpu_template.py | 10 ++++- pytorch_lightning/core/lightning.py | 37 +---------------- pytorch_lightning/trainer/distrib_parts.py | 1 + pytorch_lightning/utilities/data.py | 46 ++++++++++++++++++++++ 4 files changed, 58 insertions(+), 36 deletions(-) create mode 100644 pytorch_lightning/utilities/data.py diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py index c5fa94a3cf140..d6c20ad2cd44d 100644 --- a/pl_examples/basic_examples/gpu_template.py +++ b/pl_examples/basic_examples/gpu_template.py @@ -38,7 +38,15 @@ def main(hparams): # ------------------------ # 3 START TRAINING # ------------------------ - trainer.fit(model) + #trainer.fit(model) + model = model.cuda(0) + model.prepare_data() + loader = model.train_dataloader() + for x, y in loader: + x = x.view(x.size(0), -1) + assert model(x).device == torch.device('cuda:0'), "Automoving data to same 
device as model failed" + x = x.cuda(0) + assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" if __name__ == '__main__': diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 94e9a75a15692..df51320439019 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -102,47 +102,14 @@ def forward(self, x): if self.trainer.proc_rank == 0: print(*args, **kwargs) - # Note this is almost identical to distrib_parts.TrainerDPMixin.__transfer_data_to_device - # Only works for GPU and not TPU for now - def __transfer_data_to_device(self, batch, device): - # base case: nothing to do - if torch.is_tensor(batch) and batch.device == device: - return batch - - # object can be directly moved using `cuda` or `to` - if callable(getattr(batch, 'cuda', None)) and device.type == 'cuda': - rank_zero_warn('Auto moving data from {} to {} to match model'.format(batch.device, device)) - return batch.cuda(device=device) - - if callable(getattr(batch, 'to', None)): - rank_zero_warn('Auto moving data from {} to {} to match model'.format(batch.device, device)) - return batch.to(device=device) - - # when list or tuple - if isinstance(batch, (list, tuple)): - if isinstance(batch, tuple): - batch = list(batch) - for i, x in enumerate(batch): - batch[i] = self.__transfer_data_to_device(x, device) - return batch - - # when dict - if isinstance(batch, dict): - for k, v in batch.items(): - batch[k] = self.__transfer_data_to_device(v, device) - return batch - - # nothing matches, return the value as is without transform - return batch - def __call__(self, *input_data, **kwargs): devices = [p.device for p in self.parameters()] # All parameters must be on same device to automove data # Otherwise we just do what nn.Module does normally if len(set(devices)) == 1: device = devices[0] - input_data = self.__transfer_data_to_device(input_data, device) - kwargs = self.__transfer_data_to_device(kwargs, device) + input_data = transfer_data_to_device(input_data, device.type, device.index) + kwargs = transfer_data_to_device(kwargs, device.type, device.index) return super(LightningModule, self).__call__(*input_data, **kwargs) @abstractmethod diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 9ea54a0e00346..c9655393befb0 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -21,6 +21,7 @@ from pytorch_lightning.utilities import move_data_to_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.data import transfer_data_to_device try: from apex import amp diff --git a/pytorch_lightning/utilities/data.py b/pytorch_lightning/utilities/data.py new file mode 100644 index 0000000000000..d75aa5b144d95 --- /dev/null +++ b/pytorch_lightning/utilities/data.py @@ -0,0 +1,46 @@ +import torch + +try: + import torch_xla.core.xla_model as xm +except ImportError: + XLA_AVAILABLE = False +else: + XLA_AVAILABLE = True + + +# Utility function to copy data to given device +# Works for any form of nested lists, tuples or dictionaries containting tensors +def transfer_data_to_device(batch, device_type, idx=None): + # Deal with TPUs separately, they don't use device indexes for some reason + if device_type == 'tpu' and XLA_AVAILABLE: + if callable(getattr(batch, 'to', None)): + return batch.to(xm.xla_device()) 
+ + # base case: nothing to do + device = torch.device(device_type, idx) + if torch.is_tensor(batch) and batch.device == device: + return batch + + # object can be directly moved using `cuda` or `to` + if callable(getattr(batch, 'cuda', None)) and device_type == 'cuda': + return batch.cuda(device=device) + + if callable(getattr(batch, 'to', None)): + return batch.to(device=device) + + # when list or tuple + if isinstance(batch, (list, tuple)): + if isinstance(batch, tuple): + batch = list(batch) + for i, x in enumerate(batch): + batch[i] = transfer_data_to_device(x, device_type, idx) + return batch + + # when dict + if isinstance(batch, dict): + for k, v in batch.items(): + batch[k] = transfer_data_to_device(v, device_type, idx) + return batch + + # nothing matches, return the value as is without transform + return batch From a539cc7a0f31ec0c3bddabe5ece3e4d7e478588e Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Mon, 20 Apr 2020 01:50:51 +0100 Subject: [PATCH 14/38] Add back in warnings for autotransfer --- pytorch_lightning/core/lightning.py | 8 ++++---- pytorch_lightning/utilities/data.py | 14 +++++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index df51320439019..47a4aca0522d8 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -102,15 +102,15 @@ def forward(self, x): if self.trainer.proc_rank == 0: print(*args, **kwargs) - def __call__(self, *input_data, **kwargs): + def __call__(self, *data, **kwargs): devices = [p.device for p in self.parameters()] # All parameters must be on same device to automove data # Otherwise we just do what nn.Module does normally if len(set(devices)) == 1: device = devices[0] - input_data = transfer_data_to_device(input_data, device.type, device.index) - kwargs = transfer_data_to_device(kwargs, device.type, device.index) - return super(LightningModule, self).__call__(*input_data, **kwargs) + data = transfer_data_to_device(data, device.type, device.index, warn_on_transfer=True) + kwargs = transfer_data_to_device(kwargs, device.type, device.index, warn_on_transfer=True) + return super(LightningModule, self).__call__(*data, **kwargs) @abstractmethod def forward(self, *args, **kwargs): diff --git a/pytorch_lightning/utilities/data.py b/pytorch_lightning/utilities/data.py index d75aa5b144d95..c489d3de6a539 100644 --- a/pytorch_lightning/utilities/data.py +++ b/pytorch_lightning/utilities/data.py @@ -1,5 +1,7 @@ import torch +from pytorch_lightning.utilities import rank_zero_warn + try: import torch_xla.core.xla_model as xm except ImportError: @@ -10,10 +12,12 @@ # Utility function to copy data to given device # Works for any form of nested lists, tuples or dictionaries containting tensors -def transfer_data_to_device(batch, device_type, idx=None): +def transfer_data_to_device(batch, device_type, idx=None, warn_on_transfer=False): # Deal with TPUs separately, they don't use device indexes for some reason if device_type == 'tpu' and XLA_AVAILABLE: if callable(getattr(batch, 'to', None)): + if warn_on_transfer: + rank_zero_warn('Auto transferred data to device {}'.format(xm.xla_device())) return batch.to(xm.xla_device()) # base case: nothing to do @@ -23,9 +27,13 @@ def transfer_data_to_device(batch, device_type, idx=None): # object can be directly moved using `cuda` or `to` if callable(getattr(batch, 'cuda', None)) and device_type == 'cuda': + if warn_on_transfer: + rank_zero_warn('Auto transferred data to device 
{}'.format(device)) return batch.cuda(device=device) if callable(getattr(batch, 'to', None)): + if warn_on_transfer: + rank_zero_warn('Auto transferred data to device {}'.format(device)) return batch.to(device=device) # when list or tuple @@ -33,13 +41,13 @@ def transfer_data_to_device(batch, device_type, idx=None): if isinstance(batch, tuple): batch = list(batch) for i, x in enumerate(batch): - batch[i] = transfer_data_to_device(x, device_type, idx) + batch[i] = transfer_data_to_device(x, device_type, idx, warn_on_transfer) return batch # when dict if isinstance(batch, dict): for k, v in batch.items(): - batch[k] = transfer_data_to_device(v, device_type, idx) + batch[k] = transfer_data_to_device(v, device_type, idx, warn_on_transfer) return batch # nothing matches, return the value as is without transform From a6500dbd98a044696818b5e7b9d05df2d76fb5aa Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Mon, 20 Apr 2020 01:51:55 +0100 Subject: [PATCH 15/38] Get rid of the test code I ended up accidentally commiting again --- pl_examples/basic_examples/gpu_template.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py index d6c20ad2cd44d..c5fa94a3cf140 100644 --- a/pl_examples/basic_examples/gpu_template.py +++ b/pl_examples/basic_examples/gpu_template.py @@ -38,15 +38,7 @@ def main(hparams): # ------------------------ # 3 START TRAINING # ------------------------ - #trainer.fit(model) - model = model.cuda(0) - model.prepare_data() - loader = model.train_dataloader() - for x, y in loader: - x = x.view(x.size(0), -1) - assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" - x = x.cuda(0) - assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" + trainer.fit(model) if __name__ == '__main__': From 531124a139b808144921f2e1019bafab726a1da4 Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Mon, 20 Apr 2020 14:17:22 +0100 Subject: [PATCH 16/38] Add docs any changelog --- CHANGELOG.md | 4 ++++ pytorch_lightning/core/lightning.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 321319d6a216c..7cc11f3f6e9c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,6 +94,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added using `store_true` for bool args ([#1822](https://github.com/PyTorchLightning/pytorch-lightning/pull/1822), [#1842](https://github.com/PyTorchLightning/pytorch-lightning/pull/1842)) - Added dummy logger for internally disabling logging for some features ([#1836](https://github.com/PyTorchLightning/pytorch-lightning/pull/1836)) +- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)). 
+ +- Added automatic GPU data transfer to single GPU and CPU inference ([#1347](https://github.com/PyTorchLightning/pytorch-lightning/pull/1526)) + ### Changed - Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 47a4aca0522d8..112cc017706d4 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -103,6 +103,25 @@ def forward(self, x): print(*args, **kwargs) def __call__(self, *data, **kwargs): + r""" + Automatically moves data to correct device if possible, then call torch.nn.Module.__call__ + Lightning will warn you if it automatically moves any data + + Args: + *data: Any positional arguments for torch.nn.Module.__call__. These are typically input data + **kwargs: Any keyword arguments for torch.nn.Module.__call__ + + Example: + + .. code-block:: python + + model = model.cuda(0) + model.prepare_data() + loader = model.train_dataloader() + for x, y in loader: + output = model(x) # Lightning will automove data here and warn you of it + + """ devices = [p.device for p in self.parameters()] # All parameters must be on same device to automove data # Otherwise we just do what nn.Module does normally From 8996c4c935b5761c8820caacd039b51dccc14449 Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Mon, 20 Apr 2020 14:31:15 +0100 Subject: [PATCH 17/38] Correct PR number in Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cc11f3f6e9c6..7efcc5b6bc559 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,6 +98,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added automatic GPU data transfer to single GPU and CPU inference ([#1347](https://github.com/PyTorchLightning/pytorch-lightning/pull/1526)) +- Added automatic GPU data transfer to single GPU and CPU inference ([#1526](https://github.com/PyTorchLightning/pytorch-lightning/pull/1526)) + ### Changed - Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) From 2c50252ec5a23a64cfa510b2a2d0d8e55d37025b Mon Sep 17 00:00:00 2001 From: Hengjian Jia Date: Wed, 22 Apr 2020 20:47:59 +0100 Subject: [PATCH 18/38] Correct changelog --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7efcc5b6bc559..7cc11f3f6e9c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -98,8 +98,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added automatic GPU data transfer to single GPU and CPU inference ([#1347](https://github.com/PyTorchLightning/pytorch-lightning/pull/1526)) -- Added automatic GPU data transfer to single GPU and CPU inference ([#1526](https://github.com/PyTorchLightning/pytorch-lightning/pull/1526)) - ### Changed - Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) From 8159ed8805b055c6e297686bf1a16f962d47b38d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 30 Apr 2020 08:11:57 -0400 Subject: [PATCH 19/38] Update data.py --- pytorch_lightning/utilities/data.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/utilities/data.py b/pytorch_lightning/utilities/data.py index c489d3de6a539..a6f79ff0c05e8 100644 --- a/pytorch_lightning/utilities/data.py +++ b/pytorch_lightning/utilities/data.py @@ -10,10 +10,12 @@ XLA_AVAILABLE = True -# Utility function to copy data to given device -# Works for any form of nested lists, tuples or dictionaries containting tensors def transfer_data_to_device(batch, device_type, idx=None, warn_on_transfer=False): - # Deal with TPUs separately, they don't use device indexes for some reason + """ + Utility function to copy data to given device + Works for any form of nested lists, tuples or dictionaries containting tensors + Deal with TPUs separately, they don't use device indexes for some reason + """ if device_type == 'tpu' and XLA_AVAILABLE: if callable(getattr(batch, 'to', None)): if warn_on_transfer: From b485cd8b3baa1929f5842e7e5258c1c8e6587a31 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 17 May 2020 08:52:00 -0400 Subject: [PATCH 20/38] Update test_cpu.py --- tests/models/test_cpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index cc98b7f806961..5d20547ab20ce 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -430,4 +430,3 @@ def test_auto_move_data(tmpdir): assert model(x).device == torch.device('cpu'), "Automoving data to same device as model failed" x = x.cuda(0) assert model(x).device == torch.device('cpu'), "Automoving data to same device as model failed" - From f8218e049b7adc9d83430f1f414487c624da916b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 20 May 2020 20:24:17 +0200 Subject: [PATCH 21/38] make a decorator --- pytorch_lightning/core/decorators.py | 36 +++++++++++++++++++++++++++- pytorch_lightning/core/lightning.py | 29 ---------------------- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 3979a4fc6f7ee..0553615e4d204 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,4 +1,5 @@ -from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning import LightningModule +from pytorch_lightning.utilities import rank_zero_warn, transfer_batch_to_device def data_loader(fn): @@ -12,3 +13,36 @@ def data_loader(fn): def inner_fx(self): return fn(self) return inner_fx + + +def auto_move_data(fn): + """ + Decorator for :class:`~pytorch_lightning.core.lightning.LightningModule` methods for which + input arguments should be moved automatically to the correct device. + It as no effect if applied to a method of an object that is not an instance of + :class:`~pytorch_lightning.core.lightning.LightningModule` and is typically applied to ``__call__`` + or ``forward``. 
+ + Args: + fn: A LightningModule method for which the arguments should be moved to the device + the parameters are on. + + Example: + + .. code-block:: python + + model = model.cuda(0) + model.prepare_data() + loader = model.train_dataloader() + for x, y in loader: + output = model(x) + """ + def auto_transfer_args(self, *args, **kwargs): + if not isinstance(self, LightningModule): + return fn(self, *args, **kwargs) + + args = transfer_batch_to_device(args, self.device) + kwargs = transfer_batch_to_device(kwargs, self.device) + return fn(self, *args, **kwargs) + + return auto_transfer_args diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 112cc017706d4..c9b75894a0d2c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -102,35 +102,6 @@ def forward(self, x): if self.trainer.proc_rank == 0: print(*args, **kwargs) - def __call__(self, *data, **kwargs): - r""" - Automatically moves data to correct device if possible, then call torch.nn.Module.__call__ - Lightning will warn you if it automatically moves any data - - Args: - *data: Any positional arguments for torch.nn.Module.__call__. These are typically input data - **kwargs: Any keyword arguments for torch.nn.Module.__call__ - - Example: - - .. code-block:: python - - model = model.cuda(0) - model.prepare_data() - loader = model.train_dataloader() - for x, y in loader: - output = model(x) # Lightning will automove data here and warn you of it - - """ - devices = [p.device for p in self.parameters()] - # All parameters must be on same device to automove data - # Otherwise we just do what nn.Module does normally - if len(set(devices)) == 1: - device = devices[0] - data = transfer_data_to_device(data, device.type, device.index, warn_on_transfer=True) - kwargs = transfer_data_to_device(kwargs, device.type, device.index, warn_on_transfer=True) - return super(LightningModule, self).__call__(*data, **kwargs) - @abstractmethod def forward(self, *args, **kwargs): r""" From d01934ca8e1f89ccdd0ebe09b073f0d9085aa6a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 20 May 2020 20:32:28 +0200 Subject: [PATCH 22/38] type hint --- pytorch_lightning/core/decorators.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 0553615e4d204..d4ef18798d9f9 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,3 +1,5 @@ +from typing import Callable + from pytorch_lightning import LightningModule from pytorch_lightning.utilities import rank_zero_warn, transfer_batch_to_device @@ -15,7 +17,7 @@ def inner_fx(self): return inner_fx -def auto_move_data(fn): +def auto_move_data(fn: Callable) -> Callable: """ Decorator for :class:`~pytorch_lightning.core.lightning.LightningModule` methods for which input arguments should be moved automatically to the correct device. From 16d54609d843727ce9eccb00b42490ba6eb134ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 20 May 2020 20:49:53 +0200 Subject: [PATCH 23/38] changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7cc11f3f6e9c6..50bca08691b63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added [black](https://black.readthedocs.io/en/stable/) formatter for the code with code-checker on pull ([1610](https://github.com/PyTorchLightning/pytorch-lightning/pull/1610)) +- Added the option to automatically move data to the correct device when using the LightningModule's forward for inference ([#1905](https://github.com/PyTorchLightning/pytorch-lightning/pull/1905)) + ### Changed - Allow user to select individual TPU core to train on ([#1729](https://github.com/PyTorchLightning/pytorch-lightning/pull/1729)) @@ -96,8 +98,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)). -- Added automatic GPU data transfer to single GPU and CPU inference ([#1347](https://github.com/PyTorchLightning/pytorch-lightning/pull/1526)) - ### Changed - Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) From f19e6e37379476ac9e890ad5ea64d9d5015ed766 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 20 May 2020 20:52:44 +0200 Subject: [PATCH 24/38] changelog --- CHANGELOG.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50bca08691b63..3ed254ac0175b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -96,8 +96,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added using `store_true` for bool args ([#1822](https://github.com/PyTorchLightning/pytorch-lightning/pull/1822), [#1842](https://github.com/PyTorchLightning/pytorch-lightning/pull/1842)) - Added dummy logger for internally disabling logging for some features ([#1836](https://github.com/PyTorchLightning/pytorch-lightning/pull/1836)) -- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)). 
- ### Changed - Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) From a035785cd56821002a6334fffd815b157e5f31f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 20 May 2020 20:56:02 +0200 Subject: [PATCH 25/38] remove old function --- pytorch_lightning/utilities/data.py | 56 ----------------------------- 1 file changed, 56 deletions(-) delete mode 100644 pytorch_lightning/utilities/data.py diff --git a/pytorch_lightning/utilities/data.py b/pytorch_lightning/utilities/data.py deleted file mode 100644 index a6f79ff0c05e8..0000000000000 --- a/pytorch_lightning/utilities/data.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch - -from pytorch_lightning.utilities import rank_zero_warn - -try: - import torch_xla.core.xla_model as xm -except ImportError: - XLA_AVAILABLE = False -else: - XLA_AVAILABLE = True - - -def transfer_data_to_device(batch, device_type, idx=None, warn_on_transfer=False): - """ - Utility function to copy data to given device - Works for any form of nested lists, tuples or dictionaries containting tensors - Deal with TPUs separately, they don't use device indexes for some reason - """ - if device_type == 'tpu' and XLA_AVAILABLE: - if callable(getattr(batch, 'to', None)): - if warn_on_transfer: - rank_zero_warn('Auto transferred data to device {}'.format(xm.xla_device())) - return batch.to(xm.xla_device()) - - # base case: nothing to do - device = torch.device(device_type, idx) - if torch.is_tensor(batch) and batch.device == device: - return batch - - # object can be directly moved using `cuda` or `to` - if callable(getattr(batch, 'cuda', None)) and device_type == 'cuda': - if warn_on_transfer: - rank_zero_warn('Auto transferred data to device {}'.format(device)) - return batch.cuda(device=device) - - if callable(getattr(batch, 'to', None)): - if warn_on_transfer: - rank_zero_warn('Auto transferred data to device {}'.format(device)) - return batch.to(device=device) - - # when list or tuple - if isinstance(batch, (list, tuple)): - if isinstance(batch, tuple): - batch = list(batch) - for i, x in enumerate(batch): - batch[i] = transfer_data_to_device(x, device_type, idx, warn_on_transfer) - return batch - - # when dict - if isinstance(batch, dict): - for k, v in batch.items(): - batch[k] = transfer_data_to_device(v, device_type, idx, warn_on_transfer) - return batch - - # nothing matches, return the value as is without transform - return batch From 2d65d808be6dd7075a21a10accd8591f760b1ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 21 May 2020 16:27:24 +0200 Subject: [PATCH 26/38] import --- pytorch_lightning/core/decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index d4ef18798d9f9..aac81e61bbb5a 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,6 +1,6 @@ from typing import Callable -from pytorch_lightning import LightningModule +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import rank_zero_warn, transfer_batch_to_device From 6e6da533f54ed14152a41a1a4d3e232b708333a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 21 May 2020 19:28:52 +0200 Subject: [PATCH 27/38] test for decorator --- pytorch_lightning/core/decorators.py | 2 ++ pytorch_lightning/trainer/distrib_parts.py | 1 - tests/core/test_decorators.py | 33 
++++++++++++++++++++++ tests/models/test_cpu.py | 17 ----------- 4 files changed, 35 insertions(+), 18 deletions(-) create mode 100644 tests/core/test_decorators.py diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index aac81e61bbb5a..cc74cfc71d83f 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,3 +1,4 @@ +from functools import wraps from typing import Callable from pytorch_lightning.core.lightning import LightningModule @@ -39,6 +40,7 @@ def auto_move_data(fn: Callable) -> Callable: for x, y in loader: output = model(x) """ + @wraps(fn) def auto_transfer_args(self, *args, **kwargs): if not isinstance(self, LightningModule): return fn(self, *args, **kwargs) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index c9655393befb0..9ea54a0e00346 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -21,7 +21,6 @@ from pytorch_lightning.utilities import move_data_to_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.distributed import rank_zero_only -from pytorch_lightning.utilities.data import transfer_data_to_device try: from apex import amp diff --git a/tests/core/test_decorators.py b/tests/core/test_decorators.py new file mode 100644 index 0000000000000..1d0bbd086934a --- /dev/null +++ b/tests/core/test_decorators.py @@ -0,0 +1,33 @@ +import pytest +import torch + +from tests.base import EvalModelTemplate +from pytorch_lightning.core.decorators import auto_move_data + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.parametrize(['src_device', 'dest_device'], [ + pytest.param(torch.device('cpu'), torch.device('cpu')), + pytest.param(torch.device('cpu', 0), torch.device('cuda', 0)), + pytest.param(torch.device('cuda', 0), torch.device('cpu')), + pytest.param(torch.device('cuda', 0), torch.device('cuda', 0)), +]) +def test_auto_move_data(src_device, dest_device): + """ Test that the decorator moves the data to the device the model is on. 
""" + + class CurrentModel(EvalModelTemplate): + + # @auto_move_data + def forward(self, *args, **kwargs): + return super().forward(*args, **kwargs) + + model = CurrentModel().to(dest_device) + # setattr(model, 'forward', auto_move_data(model.forward)) + model.forward = auto_move_data(model.forward) # apply the decorator + model.prepare_data() + loader = model.train_dataloader() + + x, y, = next(iter(loader)) + x = x.flatten(1) + x = x.to(src_device) + assert model(x).device == dest_device, "Automoving data to same device as model failed" diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py index 5d20547ab20ce..6ccc0873db4fd 100644 --- a/tests/models/test_cpu.py +++ b/tests/models/test_cpu.py @@ -413,20 +413,3 @@ def test_single_gpu_model(tmpdir): model = EvalModelTemplate() tutils.run_model_test(trainer_options, model) - - -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_auto_move_data(tmpdir): - """Make sure auto moving data works for the base case where it doesn't have to move anything""" - - tutils.reset_seed() - tutils.set_random_master_port() - - model, hparams = tutils.get_default_model() - model.prepare_data() - loader = model.train_dataloader() - for x, y in loader: - x = x.view(x.size(0), -1) - assert model(x).device == torch.device('cpu'), "Automoving data to same device as model failed" - x = x.cuda(0) - assert model(x).device == torch.device('cpu'), "Automoving data to same device as model failed" From 454d7e233244e67c2c1eba6b604ae2a260b4146d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 21 May 2020 19:53:46 +0200 Subject: [PATCH 28/38] fix test --- tests/core/test_decorators.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/core/test_decorators.py b/tests/core/test_decorators.py index 1d0bbd086934a..0f35a1630e1d9 100644 --- a/tests/core/test_decorators.py +++ b/tests/core/test_decorators.py @@ -16,18 +16,18 @@ def test_auto_move_data(src_device, dest_device): """ Test that the decorator moves the data to the device the model is on. 
""" class CurrentModel(EvalModelTemplate): + pass - # @auto_move_data - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) + # apply the decorator + CurrentModel.forward = auto_move_data(CurrentModel.forward) - model = CurrentModel().to(dest_device) - # setattr(model, 'forward', auto_move_data(model.forward)) - model.forward = auto_move_data(model.forward) # apply the decorator + model = CurrentModel() + model = model.to(dest_device) model.prepare_data() loader = model.train_dataloader() - x, y, = next(iter(loader)) x = x.flatten(1) + + # test that data on source device gets moved to destination device x = x.to(src_device) assert model(x).device == dest_device, "Automoving data to same device as model failed" From d8e1bd7e05dc783a738d3104830e9019104c35de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 21 May 2020 21:01:46 +0200 Subject: [PATCH 29/38] remove old test --- tests/models/test_gpu.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index c9698de2366ea..80249a727ccbb 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -97,24 +97,6 @@ def test_multi_gpu_none_backend(tmpdir): tutils.run_model_test(trainer_options, model) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") -def test_auto_move_data(tmpdir): - """Make sure auto moving data works""" - - tutils.reset_seed() - tutils.set_random_master_port() - - model, hparams = tutils.get_default_model() - model = model.cuda(0) - model.prepare_data() - loader = model.train_dataloader() - for x, y in loader: - x = x.view(x.size(0), -1) - assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" - x = x.cuda(0) - assert model(x).device == torch.device('cuda:0'), "Automoving data to same device as model failed" - - @pytest.fixture def mocked_device_count(monkeypatch): def device_count(): From ecc1d6e7762e902e0d20c08958091cfed3b8d3a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 7 Jun 2020 00:37:41 +0200 Subject: [PATCH 30/38] doctest --- pytorch_lightning/core/decorators.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index cc74cfc71d83f..15843492293cb 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,8 +1,10 @@ from functools import wraps from typing import Callable +import torch + from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import rank_zero_warn, transfer_batch_to_device +from pytorch_lightning.utilities import rank_zero_warn def data_loader(fn): @@ -32,21 +34,24 @@ def auto_move_data(fn: Callable) -> Callable: Example: - .. code-block:: python + >>> class LitModel(LightningModule): + ... @auto_move_data + ... def forward(self, x): + ... 
return x + >>> LitModel.forward = auto_move_data(LitModel.forward) + >>> model = LitModel() + >>> model = model.to('cuda') + >>> model(torch.zeros(1, 3)) + tensor([[0., 0., 0.]], device='cuda:0') - model = model.cuda(0) - model.prepare_data() - loader = model.train_dataloader() - for x, y in loader: - output = model(x) """ @wraps(fn) def auto_transfer_args(self, *args, **kwargs): if not isinstance(self, LightningModule): return fn(self, *args, **kwargs) - args = transfer_batch_to_device(args, self.device) - kwargs = transfer_batch_to_device(kwargs, self.device) + args = self.transfer_batch_to_device(args, self.device) + kwargs = self.transfer_batch_to_device(kwargs, self.device) return fn(self, *args, **kwargs) return auto_transfer_args From 0dd89856f17ff7c1f6711a6cd69dc4da69ac24b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 7 Jun 2020 20:20:53 +0200 Subject: [PATCH 31/38] apply decorator directly --- tests/core/test_decorators.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/core/test_decorators.py b/tests/core/test_decorators.py index 0f35a1630e1d9..e23db3d74c97c 100644 --- a/tests/core/test_decorators.py +++ b/tests/core/test_decorators.py @@ -15,13 +15,10 @@ def test_auto_move_data(src_device, dest_device): """ Test that the decorator moves the data to the device the model is on. """ - class CurrentModel(EvalModelTemplate): - pass - # apply the decorator - CurrentModel.forward = auto_move_data(CurrentModel.forward) + EvalModelTemplate.forward = auto_move_data(EvalModelTemplate.forward) - model = CurrentModel() + model = EvalModelTemplate() model = model.to(dest_device) model.prepare_data() loader = model.train_dataloader() From a1ddb86fef38511e3b385bd99f6ffb622e684cab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 8 Jun 2020 02:02:26 +0200 Subject: [PATCH 32/38] convert doctest to code block --- pytorch_lightning/core/decorators.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index 15843492293cb..8f2721201a124 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -34,15 +34,24 @@ def auto_move_data(fn: Callable) -> Callable: Example: - >>> class LitModel(LightningModule): - ... @auto_move_data - ... def forward(self, x): - ... return x - >>> LitModel.forward = auto_move_data(LitModel.forward) - >>> model = LitModel() - >>> model = model.to('cuda') - >>> model(torch.zeros(1, 3)) - tensor([[0., 0., 0.]], device='cuda:0') + .. 
+        .. code-block:: python
+
+            # directly in the source code
+            class LitModel(LightningModule):
+
+                @auto_move_data
+                def forward(self, x):
+                    return x
+
+            # or outside
+            LitModel.forward = auto_move_data(LitModel.forward)
+
+            model = LitModel()
+            model = model.to('cuda')
+            model(torch.zeros(1, 3))
+
+            # input gets moved to device
+            # tensor([[0., 0., 0.]], device='cuda:0')

     """
     @wraps(fn)

From cc4cb6b2678f7279d037b63c0957f73fd30ae2e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 8 Jun 2020 19:30:33 +0200
Subject: [PATCH 33/38] prevent side effects in tests

---
 tests/core/test_decorators.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_decorators.py b/tests/core/test_decorators.py
index e23db3d74c97c..0f35a1630e1d9 100644
--- a/tests/core/test_decorators.py
+++ b/tests/core/test_decorators.py
@@ -15,10 +15,13 @@ def test_auto_move_data(src_device, dest_device):
     """ Test that the decorator moves the data to the device the model is on. """
+    class CurrentModel(EvalModelTemplate):
+        pass
+
     # apply the decorator
-    EvalModelTemplate.forward = auto_move_data(EvalModelTemplate.forward)
+    CurrentModel.forward = auto_move_data(CurrentModel.forward)

-    model = EvalModelTemplate()
+    model = CurrentModel()
     model = model.to(dest_device)
     model.prepare_data()
     loader = model.train_dataloader()

From 42fc1b82749734d8144b448e4be19f210a4e3e40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 8 Jun 2020 19:40:00 +0200
Subject: [PATCH 34/38] fix merge

---
 tests/models/test_cpu.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index 6ccc0873db4fd..051db81d1b165 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -397,19 +397,3 @@ def train_dataloader(self):

     result = trainer.fit(model)
     assert result == 1, 'training failed to complete'
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_single_gpu_model(tmpdir):
-    """Make sure single GPU works (DP mode)."""
-    trainer_options = dict(
-        default_root_dir=tmpdir,
-        progress_bar_refresh_rate=0,
-        max_epochs=1,
-        train_percent_check=0.1,
-        val_percent_check=0.1,
-        gpus=1
-    )
-
-    model = EvalModelTemplate()
-    tutils.run_model_test(trainer_options, model)

From 587a2b27e47cab4a0fb19f33468c7bacc930db04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 8 Jun 2020 19:54:34 +0200
Subject: [PATCH 35/38] update forward docs

---
 pytorch_lightning/core/lightning.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index c9b75894a0d2c..38fa856a4ed14 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -107,13 +107,14 @@ def forward(self, *args, **kwargs):
         r"""
         Same as :meth:`torch.nn.Module.forward()`, however in Lightning you want this to define
         the operations you want to use for prediction (i.e.: on a server or as a feature extractor).
-        LightningModule will also automatically copy data to the same device as the model if the model
-        is on CPU or a single GPU for inference.

         Normally you'd call ``self()`` from your :meth:`training_step` method.
         This makes it easy to write a complex system for training with the outputs
         you'd want in a prediction setting.

+        You may also find the :func:`pytorch_lightning.core.decorators.auto_move_data` decorator useful
+        when using the module outside Lightning in a production setting.
+
         Args:
             *args: Whatever you decide to pass into the forward method.
             **kwargs: Keyword arguments are also possible.

From 649c6c9541c0c81993196d3c979abac429e85220 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 8 Jun 2020 19:59:38 +0200
Subject: [PATCH 36/38] update docs

---
 pytorch_lightning/core/lightning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 38fa856a4ed14..e956f166b706d 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -112,7 +112,7 @@ def forward(self, *args, **kwargs):
         This makes it easy to write a complex system for training with the outputs
         you'd want in a prediction setting.

-        You may also find the :func:`pytorch_lightning.core.decorators.auto_move_data` decorator useful
+        You may also find the :func:`~pytorch_lightning.core.decorators.auto_move_data` decorator useful
         when using the module outside Lightning in a production setting.

         Args:

From 8099b0af7743c5f56fe7e81a589eccef4d62300e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 15 Jun 2020 22:03:49 +0200
Subject: [PATCH 37/38] added docs in section "deployment / prediction"

---
 pytorch_lightning/trainer/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py
index 3a2881fe75121..93fb959c75ec6 100644
--- a/pytorch_lightning/trainer/__init__.py
+++ b/pytorch_lightning/trainer/__init__.py
@@ -101,6 +101,11 @@ def forward(self, x):
     out = pretrained_model(x)
     api_write({'response': out}

+
+You may wish to run the model on a variety of devices. Instead of moving the data
+manually to the correct device, decorate the forward method (or any other method you use for inference)
+with :func:`~pytorch_lightning.core.decorators.auto_move_data` and Lightning will take care of the rest.
+
 ------------

 Reproducibility

From f5f9e7512f313846a4ba3bc5aeb47f097a358dd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 15 Jun 2020 22:07:09 +0200
Subject: [PATCH 38/38] update changelog

---
 CHANGELOG.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b46f9ae201f0..f5fa80353cab8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -35,8 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added back the slow spawn ddp implementation as `ddp_spawn` ([#2115](https://github.com/PyTorchLightning/pytorch-lightning/pull/2115))
 - Added loading checkpoints from URLs ([#1667](https://github.com/PyTorchLightning/pytorch-lightning/issues/1667))
 - Added a callback method `on_keyboard_interrupt` for handling KeyboardInterrupt events during training ([#2134](https://github.com/PyTorchLightning/pytorch-lightning/pull/2134))
-
-- Added the option to automatically move data to the correct device when using the LightningModule's forward for inference ([#1905](https://github.com/PyTorchLightning/pytorch-lightning/pull/1905))
+- Added a decorator `auto_move_data` that moves data to the correct device when using the LightningModule for inference ([#1905](https://github.com/PyTorchLightning/pytorch-lightning/pull/1905))

 ### Changed
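
Taken together, the patches above introduce the `auto_move_data` decorator, point to it from the `forward` docstring and the trainer docs, and record it in the changelog. The snippet below is a minimal, self-contained sketch of the inference workflow those docs describe; it is illustrative only. The `LitClassifier` class, its layer sizes, and the device-selection line are invented for this example, while the import path `pytorch_lightning.core.decorators.auto_move_data` is the one added by these patches.

.. code-block:: python

    import torch
    from torch import nn

    from pytorch_lightning import LightningModule
    from pytorch_lightning.core.decorators import auto_move_data


    class LitClassifier(LightningModule):
        """Hypothetical model used only to illustrate the decorator."""

        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(28 * 28, 10)

        @auto_move_data
        def forward(self, x):
            # by the time forward runs, x has been moved to self.device
            return self.layer(x)


    model = LitClassifier()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    # the input lives on the CPU; the decorator transfers it to the model's
    # device before forward executes
    x = torch.rand(4, 28 * 28)
    with torch.no_grad():
        out = model(x)

    print(out.device)  # same device the model was moved to

If editing the class is not possible, the same patches also show the decorator being applied from the outside, for example `LitClassifier.forward = auto_move_data(LitClassifier.forward)`. Under the hood the wrapper calls the module's `transfer_batch_to_device` hook with `self.device`, so custom batch structures handled by that hook should be moved in the same way.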