From f9d9194caa293829baf9220c63c48714866d2e83 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sat, 1 Aug 2020 18:51:34 +0530
Subject: [PATCH 01/10] Fix shuffle for distributed sampler

---
 pytorch_lightning/trainer/data_loading.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 525956d257521..09186765c6eee 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -163,7 +163,7 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                     ' `replace_sampler_ddp`=False if you want to use your custom sampler.')
 
             # replace with distributed sampler
-            sampler = self._get_distributed_sampler(dataloader)
+            sampler = self._get_distributed_sampler(dataloader, train)
             dataloader = self.replace_sampler(dataloader, sampler)
 
         return dataloader
@@ -179,7 +179,7 @@ def replace_sampler(self, dataloader, sampler):
         dataloader = type(dataloader)(**dl_args)
         return dataloader
 
-    def _get_distributed_sampler(self, dataloader):
+    def _get_distributed_sampler(self, dataloader, train):
         if self.use_tpu:
             kwargs = dict(num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal())
         elif self.use_horovod:
@@ -193,6 +193,8 @@ def _get_distributed_sampler(self, dataloader):
             }
             assert self.distributed_backend is not None
             kwargs = dict(num_replicas=world_size[self.distributed_backend], rank=self.global_rank)
+
+        kwargs['shuffle'] = train
         sampler = DistributedSampler(dataloader.dataset, **kwargs)
         return sampler
 

From aaa1726a058117c3f891639c70492588363be3a2 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sat, 1 Aug 2020 22:14:03 +0530
Subject: [PATCH 02/10] add test

---
 tests/trainer/test_dataloaders.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 23aa7547a689d..b4f601a1c3a79 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -7,6 +7,7 @@
 from packaging.version import parse
 from torch.utils.data.dataloader import DataLoader
 from torch.utils.data.dataset import IterableDataset, Subset
+from torch.utils.data.distributed import DistributedSampler
 
 import tests.base.develop_pipelines as tpipes
 from pytorch_lightning import Trainer
@@ -640,6 +641,31 @@ class CustomSampler(torch.utils.data.Sampler):
         CustomDataLoader(list(range(1000)), sampler=CustomSampler(list(range(1000)))), train=True)
 
 
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason='Test requires multiple GPUs')
+def test_dataloader_distributed_sampler(tmpdir):
+    """ Test DistributedSampler and it's arguments for DDP backend """
+
+    model = EvalModelTemplate()
+
+    trainer = Trainer(
+        gpus=[0, 1],
+        num_nodes=1,
+        distributed_backend='ddp_spawn',
+        default_root_dir=tmpdir
+    )
+
+    trainer.fit()
+    trainer.test(ckpt_path=None)
+
+    assert isinstance(model.train_dataloader().sampler, DistributedSampler)
+    assert isinstance(model.val_dataloader().sampler, DistributedSampler)
+    assert isinstance(model.val_dataloader().sampler, DistributedSampler)
+
+    assert model.train_dataloader().sampler.shuffle
+    assert not model.val_dataloader().sampler.shuffle
+    assert not model.test_dataloader().sampler.shuffle
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs')
 def test_batch_size_smaller_than_num_gpus(tmpdir):
     # we need at least 3 gpus for this test
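Note: the substance of PATCH 01 is that the `train` flag already passed to `auto_add_sampler` is now forwarded to `DistributedSampler` as its `shuffle` argument, so the automatically inserted training sampler shuffles across ranks while validation and test keep a fixed order. A minimal plain-PyTorch sketch of the resulting behaviour (the helper name `get_distributed_sampler` and the toy dataset are illustrative, not part of the patch):

    import torch
    from torch.utils.data import TensorDataset
    from torch.utils.data.distributed import DistributedSampler


    def get_distributed_sampler(dataset, train, num_replicas, rank):
        # mirrors the patched Trainer._get_distributed_sampler: shuffle only
        # for the training dataloader, never for validation/test
        kwargs = dict(num_replicas=num_replicas, rank=rank)
        kwargs['shuffle'] = train
        return DistributedSampler(dataset, **kwargs)


    dataset = TensorDataset(torch.arange(8).float())
    train_sampler = get_distributed_sampler(dataset, train=True, num_replicas=2, rank=0)
    val_sampler = get_distributed_sampler(dataset, train=False, num_replicas=2, rank=0)

    # same checks that PATCH 02 adds as a unit test
    assert train_sampler.shuffle and not val_sampler.shuffle
    print(list(train_sampler), list(val_sampler))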
From 668e465cd4511013699ea9b77f4e7f8085d299d3 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sat, 1 Aug 2020 22:16:48 +0530
Subject: [PATCH 03/10] test

---
 tests/trainer/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index b4f601a1c3a79..0f4db6fcc8529 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -659,7 +659,7 @@ def test_dataloader_distributed_sampler(tmpdir):
 
     assert isinstance(model.train_dataloader().sampler, DistributedSampler)
     assert isinstance(model.val_dataloader().sampler, DistributedSampler)
-    assert isinstance(model.val_dataloader().sampler, DistributedSampler)
+    assert isinstance(model.test_dataloader().sampler, DistributedSampler)
 
     assert model.train_dataloader().sampler.shuffle
     assert not model.val_dataloader().sampler.shuffle

From ffeba8023eda0bf8c1c29598ff347deaea581028 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sat, 1 Aug 2020 22:37:37 +0530
Subject: [PATCH 04/10] chlog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c44db54cac5a1..ac509ae5b5a88 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,6 +49,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed data transfer to device when using `torchtext.data.Field` and `include_lengths is True` ([#2689](https://github.com/PyTorchLightning/pytorch-lightning/pull/2689))
 
+- Fixed shuffle argument for distributed sampler ([#2789](https://github.com/PyTorchLightning/pytorch-lightning/pull/2789))
+
 ## [0.8.5] - 2020-07-09
 
 ### Added

From 468f951bc7c4677eab1ce69f4d99d7dbaea742eb Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sat, 1 Aug 2020 22:44:02 +0530
Subject: [PATCH 05/10] update test

---
 tests/trainer/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 0f4db6fcc8529..166d5ca1591e5 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -654,7 +654,7 @@ def test_dataloader_distributed_sampler(tmpdir):
         default_root_dir=tmpdir
     )
 
-    trainer.fit()
+    trainer.fit(model)
     trainer.test(ckpt_path=None)
 
     assert isinstance(model.train_dataloader().sampler, DistributedSampler)

From 894f45aedde05c1b5ff5919cae732819aa5dc72c Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sat, 1 Aug 2020 22:53:58 +0530
Subject: [PATCH 06/10] update test

---
 tests/trainer/test_dataloaders.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 166d5ca1591e5..3c9d78c97b524 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -651,7 +651,8 @@ def test_dataloader_distributed_sampler(tmpdir):
         gpus=[0, 1],
         num_nodes=1,
         distributed_backend='ddp_spawn',
-        default_root_dir=tmpdir
+        default_root_dir=tmpdir,
+        max_epochs=1
     )
 
     trainer.fit(model)

From 06b993c345ad5bdc57f534d76db2c1dc92bea361 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sat, 1 Aug 2020 23:27:25 +0530
Subject: [PATCH 07/10] update test

---
 tests/trainer/test_dataloaders.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 3c9d78c97b524..485d15661728a 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -658,13 +658,17 @@ def test_dataloader_distributed_sampler(tmpdir):
     trainer.fit(model)
     trainer.test(ckpt_path=None)
 
-    assert isinstance(model.train_dataloader().sampler, DistributedSampler)
-    assert isinstance(model.val_dataloader().sampler, DistributedSampler)
-    assert isinstance(model.test_dataloader().sampler, DistributedSampler)
+    train_sampler = trainer.train_dataloader.sampler
+    val_sampler = trainer.val_dataloaders[0].sampler
+    test_sampler = trainer.test_dataloaders[0].sampler
 
-    assert model.train_dataloader().sampler.shuffle
-    assert not model.val_dataloader().sampler.shuffle
-    assert not model.test_dataloader().sampler.shuffle
+    assert isinstance(train_sampler, DistributedSampler)
+    assert isinstance(val_sampler, DistributedSampler)
+    assert isinstance(test_sampler, DistributedSampler)
+
+    assert train_sampler.shuffle
+    assert not val_sampler.shuffle
+    assert not test_sampler.shuffle
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs')
 def test_batch_size_smaller_than_num_gpus(tmpdir):
     # we need at least 3 gpus for this test
From deb268290289f05b961b2391e508f8a8c508f8c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sat, 1 Aug 2020 22:19:36 +0200
Subject: [PATCH 08/10] assertions via callback

---
 tests/trainer/test_dataloaders.py | 35 ++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 485d15661728a..856d6f75a851b 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -10,7 +10,7 @@
 from torch.utils.data.distributed import DistributedSampler
 
 import tests.base.develop_pipelines as tpipes
-from pytorch_lightning import Trainer
+from pytorch_lightning import Trainer, Callback
 from pytorch_lightning.trainer.data_loading import _has_iterable_dataset, _has_len
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import EvalModelTemplate
@@ -645,6 +645,23 @@ class CustomSampler(torch.utils.data.Sampler):
 def test_dataloader_distributed_sampler(tmpdir):
     """ Test DistributedSampler and it's arguments for DDP backend """
 
+    class DistribCallback(Callback):
+
+        def on_train_start(self, trainer, pl_module):
+            train_sampler = trainer.train_dataloader.sampler
+            assert isinstance(train_sampler, DistributedSampler)
+            assert train_sampler.shuffle
+
+        def on_validation_start(self, trainer, pl_module):
+            val_sampler = trainer.val_dataloaders[0].sampler
+            assert isinstance(val_sampler, DistributedSampler)
+            assert not val_sampler.shuffle
+
+        def on_test_start(self, trainer, pl_module):
+            test_sampler = trainer.test_dataloaders[0].sampler
+            assert isinstance(test_sampler, DistributedSampler)
+            assert not test_sampler.shuffle
+
     model = EvalModelTemplate()
 
     trainer = Trainer(
@@ -652,24 +669,12 @@ def test_dataloader_distributed_sampler(tmpdir):
         num_nodes=1,
         distributed_backend='ddp_spawn',
         default_root_dir=tmpdir,
-        max_epochs=1
+        max_steps=1,
+        callbacks=[DistribCallback()]
     )
-
     trainer.fit(model)
     trainer.test(ckpt_path=None)
-
-    train_sampler = trainer.train_dataloader.sampler
-    val_sampler = trainer.val_dataloaders[0].sampler
-    test_sampler = trainer.test_dataloaders[0].sampler
-
-    assert isinstance(train_sampler, DistributedSampler)
-    assert isinstance(val_sampler, DistributedSampler)
-    assert isinstance(test_sampler, DistributedSampler)
-
-    assert train_sampler.shuffle
-    assert not val_sampler.shuffle
-    assert not test_sampler.shuffle
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs')
 def test_batch_size_smaller_than_num_gpus(tmpdir):
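Note: the next patch, PATCH 09, moves this callback out of the test function, as its subject line says, "for pickling". That lines up with how `ddp_spawn` works: the spawned worker processes receive a pickled copy of the trainer state, and a class defined inside a function body cannot be pickled by reference. A small standalone illustration of that constraint, independent of Lightning's API (the class and function names are illustrative only):

    import pickle


    class ModuleLevelCallback:
        """Defined at module scope, so pickle can import it by qualified name."""


    def make_local_callback():
        class LocalCallback:
            """Defined inside a function, so pickle cannot locate it later."""
        return LocalCallback()


    pickle.dumps(ModuleLevelCallback())  # fine

    try:
        pickle.dumps(make_local_callback())
    except (pickle.PicklingError, AttributeError) as err:
        # e.g. "Can't pickle local object 'make_local_callback.<locals>.LocalCallback'"
        print('pickling the locally defined class failed:', err)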
From 5f0b094aa33ea61d392891981f0162c1edfd3940 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sat, 1 Aug 2020 22:35:22 +0200
Subject: [PATCH 09/10] define callback outside for pickling

---
 tests/trainer/test_dataloaders.py | 36 +++++++++++++++----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 856d6f75a851b..0b471e1f2a94b 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -641,36 +641,36 @@ class CustomSampler(torch.utils.data.Sampler):
         CustomDataLoader(list(range(1000)), sampler=CustomSampler(list(range(1000)))), train=True)
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason='Test requires multiple GPUs')
-def test_dataloader_distributed_sampler(tmpdir):
-    """ Test DistributedSampler and it's arguments for DDP backend """
+class DistribSamplerCallback(Callback):
 
-    class DistribCallback(Callback):
+    def on_train_start(self, trainer, pl_module):
+        train_sampler = trainer.train_dataloader.sampler
+        assert isinstance(train_sampler, DistributedSampler)
+        assert train_sampler.shuffle
 
-        def on_train_start(self, trainer, pl_module):
-            train_sampler = trainer.train_dataloader.sampler
-            assert isinstance(train_sampler, DistributedSampler)
-            assert train_sampler.shuffle
+    def on_validation_start(self, trainer, pl_module):
+        val_sampler = trainer.val_dataloaders[0].sampler
+        assert isinstance(val_sampler, DistributedSampler)
+        assert not val_sampler.shuffle
 
-        def on_validation_start(self, trainer, pl_module):
-            val_sampler = trainer.val_dataloaders[0].sampler
-            assert isinstance(val_sampler, DistributedSampler)
-            assert not val_sampler.shuffle
+    def on_test_start(self, trainer, pl_module):
+        test_sampler = trainer.test_dataloaders[0].sampler
+        assert isinstance(test_sampler, DistributedSampler)
+        assert not test_sampler.shuffle
 
-        def on_test_start(self, trainer, pl_module):
-            test_sampler = trainer.test_dataloaders[0].sampler
-            assert isinstance(test_sampler, DistributedSampler)
-            assert not test_sampler.shuffle
 
-    model = EvalModelTemplate()
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason='Test requires multiple GPUs')
+def test_dataloader_distributed_sampler(tmpdir):
+    """ Test DistributedSampler and it's arguments for DDP backend """
+    model = EvalModelTemplate()
 
     trainer = Trainer(
         gpus=[0, 1],
         num_nodes=1,
         distributed_backend='ddp_spawn',
         default_root_dir=tmpdir,
         max_steps=1,
-        callbacks=[DistribCallback()]
+        callbacks=[DistribSamplerCallback()]
     )
     trainer.fit(model)
     trainer.test(ckpt_path=None)

From 593814ac1fd663c796edcb78051af5d4790b4088 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Sun, 2 Aug 2020 02:31:39 +0530
Subject: [PATCH 10/10] skip ddp test on windows

---
 tests/trainer/test_dataloaders.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 0b471e1f2a94b..1c7e21b7a72bb 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -659,6 +659,7 @@ def on_test_start(self, trainer, pl_module):
         assert not test_sampler.shuffle
 
 
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Does not apply to Windows platform.')
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason='Test requires multiple GPUs')
 def test_dataloader_distributed_sampler(tmpdir):
     """ Test DistributedSampler and it's arguments for DDP backend """
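Closing note on why the `shuffle` flag matters: with `shuffle=True`, `DistributedSampler` draws a new per-rank permutation whenever `set_epoch()` is called, while `shuffle=False` always yields the same ordered split, which is the behaviour these patches reserve for validation and test. A small plain-PyTorch sketch (the toy dataset and the explicit `num_replicas`/`rank` values are illustrative, not taken from the patches):

    import torch
    from torch.utils.data import TensorDataset
    from torch.utils.data.distributed import DistributedSampler

    dataset = TensorDataset(torch.arange(10).float())

    shuffled = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
    ordered = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=False)

    for epoch in range(2):
        shuffled.set_epoch(epoch)  # reseeds the permutation for this epoch
        print('epoch', epoch, 'shuffled rank-0 indices:', list(shuffled))
        print('epoch', epoch, 'ordered  rank-0 indices:', list(ordered))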