From de9c9f0864418a83f295e4c87be50e12645bd83a Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Aug 2020 22:34:49 +0530 Subject: [PATCH 01/39] Support limit_mode_batches (int) for infinite dataloader (#2787) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Support limit_mode_batches(int) for infinite dataloader * flake8 * revert and update * add and update tests * pep8 * chlog * Update CHANGELOG.md Co-authored-by: Adrian Wälchli * Add suggestions by @awaelchli * docs * Apply suggestions from code review Co-authored-by: Ethan Harris * Apply suggestions from code review * fix * max * check Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Ethan Harris Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 + docs/source/sequences.rst | 21 ++++-- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/data_loading.py | 45 ++++++----- pytorch_lightning/trainer/training_tricks.py | 2 +- tests/models/test_onnx_save.py | 2 +- tests/trainer/test_dataloaders.py | 79 ++++++++++++++++++-- 7 files changed, 112 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a56caac77a16f5..f1300140f229b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) + ### Changed - Truncated long version numbers in progress bar ([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594)) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index e24ee5bbca1cc9..b9a8f2ee642aad 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -49,8 +49,8 @@ Lightning can handle TBTT automatically via this flag. .. note:: If you need to modify how the batch is split, override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`. -.. note:: Using this feature requires updating your LightningModule's :meth:`pytorch_lightning.core.LightningModule.training_step` to include - a `hiddens` arg. +.. note:: Using this feature requires updating your LightningModule's + :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg. ---------- @@ -59,10 +59,13 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or to an int - (specifying the number of training batches to run before validation) when initializing the Trainer. - This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate - the validation interval when val_check_interval is less than one. +.. 
note:: When using an IterableDataset you must set the ``val_check_interval`` to 1.0 (the default) or an int + (specifying the number of training batches to run before validation) when initializing the Trainer. This is + because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation + interval when ``val_check_interval`` is less than one. Similarly, you can set ``limit_{mode}_batches`` to a float or + an int. If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` + to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. + Here mode can be train/val/test. .. testcode:: @@ -87,3 +90,9 @@ option when using sequential data. # Set val_check_interval trainer = Trainer(val_check_interval=100) + + # Set limit_val_batches to 0.0 or 0 + trainer = Trainer(limit_val_batches=0.0) + + # Set limit_val_batches as an int + trainer = Trainer(limit_val_batches=100) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 36b3ec3229ee39..ea9898da5214e0 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1754,7 +1754,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError(f'input_sample and example_input_array tensors are both missing.') + raise ValueError('`input_sample` and `example_input_array` tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 09186765c6eeec..4eec847580636f 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -212,18 +212,19 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # automatically add samplers self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True) + self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if not _has_len(self.train_dataloader): - self.num_training_batches = float('inf') - else: - # try getting the length - if isinstance(self.limit_train_batches, float): - self.num_training_batches = len(self.train_dataloader) - self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - else: - self.num_training_batches = min(len(self.train_dataloader), self.limit_train_batches) + if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: + self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) + elif self.num_training_batches != float('inf'): + self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) + elif self.limit_train_batches != 1.0: + raise MisconfigurationException( + 'When using an IterableDataset for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + ' `num_training_batches` to use.') # determine when to check validation # if int passed in, val checks that often @@ -241,8 +242,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = float('inf') else: raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - ' or when DataLoader does not implement `__len__`) for `train_dataloader`,' + 'When using an IterableDataset for `train_dataloader`,' ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: @@ -304,24 +304,21 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') + self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') - if num_batches != float('inf'): - self._check_batch_limits(f'limit_{mode}_batches') - - # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, float): - num_batches = int(num_batches * limit_eval_batches) - else: - num_batches = min(len(dataloader), limit_eval_batches) - - elif limit_eval_batches not in (0.0, 1.0): + # limit num batches either as a percent or num steps + if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: + num_batches = min(num_batches, int(limit_eval_batches)) + elif num_batches != float('inf'): + num_batches = int(num_batches * limit_eval_batches) + elif limit_eval_batches != 1.0: raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0` or `1.0`.') + 'When using an IterableDataset for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + f' `num_{mode}_batches` to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py index 20eeff3878cc20..5bea8fbc1a3cd6 100644 --- a/pytorch_lightning/trainer/training_tricks.py +++ b/pytorch_lightning/trainer/training_tricks.py @@ -269,7 +269,7 @@ def _adjust_batch_size(trainer, if hasattr(model, batch_arg_name): setattr(model, batch_arg_name, value) else: - setattr(model.hparams, batch_arg_name, value) + setattr(model.hparams, batch_arg_name, value) new_size = value if desc: log.info(f'Batch size {batch_size} {desc}, trying batch size {new_size}') diff --git a/tests/models/test_onnx_save.py b/tests/models/test_onnx_save.py index f824f33c93bc14..7cb40561f77318 100644 --- a/tests/models/test_onnx_save.py +++ b/tests/models/test_onnx_save.py @@ -84,7 +84,7 @@ def test_error_if_no_input(tmpdir): model = EvalModelTemplate() model.example_input_array = None file_path = os.path.join(tmpdir, "model.onxx") - with pytest.raises(ValueError, match=r'input_sample and example_input_array tensors are both missing'): + with pytest.raises(ValueError, match=r'`input_sample` and `example_input_array` tensors are both missing'): model.to_onnx(file_path) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1c7e21b7a72bb5..1aad5047855a2e 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -256,6 +256,69 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): f'Multiple `test_dataloaders` not initiated properly, got {trainer.test_dataloaders}' +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [ + pytest.param(0.0, 0.0, 0.0), + pytest.param(1.0, 1.0, 1.0), + ] +) +def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, + limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results == 1 + assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') + assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') + + +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [ + pytest.param(0, 0, 0), + pytest.param(10, 10, 10), + ] +) +def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. 
IterableDataset) passed with batch limit as number""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results + assert trainer.num_training_batches == limit_train_batches + assert trainer.num_val_batches[0] == limit_val_batches + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == limit_test_batches + + @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], [ @@ -266,7 +329,7 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): ] ) def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit in percent""" + """Verify num_batches for train, val & test dataloaders passed with batch limit in percent""" model = EvalModelTemplate() model.val_dataloader = model.val_dataloader__multiple_mixed_length model.test_dataloader = model.test_dataloader__multiple_mixed_length @@ -307,7 +370,7 @@ def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit as number""" + """Verify num_batches for train, val & test dataloaders passed with batch limit as number""" os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() @@ -436,7 +499,7 @@ def test_train_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -447,7 +510,7 @@ def test_val_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -458,7 +521,7 @@ def test_test_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) @@ -774,7 +837,7 @@ def test_train_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -785,7 +848,7 @@ def test_val_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -796,5 +859,5 @@ def 
test_test_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) From e31c520c21e1b2090d4cf889a2daac4188e9e2ce Mon Sep 17 00:00:00 2001 From: Ananya Harsh Jha Date: Wed, 5 Aug 2020 13:29:05 -0400 Subject: [PATCH 02/39] add support for sync_bn (#2801) * initial commit for sync_bn * updated changelog * tests * tests * ddp tests hanging with script tests * updated trainer * updated params * test * passingtests * passing tests * passing tests * passing tests * tests * removed apex * doc * doc * doc * doc * docs * tests * tests * tests --- CHANGELOG.md | 2 + pl_examples/basic_examples/sync_bn.py | 204 ++++++++++++++++++ pl_examples/test_examples.py | 14 ++ .../accelerator_backends/ddp_backend.py | 4 + .../accelerator_backends/ddp_spawn_backend.py | 4 + pytorch_lightning/core/lightning.py | 17 ++ pytorch_lightning/trainer/trainer.py | 6 + 7 files changed, 251 insertions(+) create mode 100644 pl_examples/basic_examples/sync_bn.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f1300140f229b7..00fa8d1bf8985f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added SyncBN for DDP ([#2801](https://github.com/PyTorchLightning/pytorch-lightning/pull/2801)) + - Added SSIM metrics ([#2671](https://github.com/PyTorchLightning/pytorch-lightning/pull/2671)) - Added BLEU metrics ([#2535](https://github.com/PyTorchLightning/pytorch-lightning/pull/2535)) diff --git a/pl_examples/basic_examples/sync_bn.py b/pl_examples/basic_examples/sync_bn.py new file mode 100644 index 00000000000000..bb602a74ea89c9 --- /dev/null +++ b/pl_examples/basic_examples/sync_bn.py @@ -0,0 +1,204 @@ +""" +Sync-bn with DDP (GPU) + +This code is to verify that batch statistics are synchronized across GPUs using sync-bn. +When sync_bn is set to True the training loop should run for 3 iterations. +When sync_bn is set to False, the code should result in an AssertionError. +""" +import os +import math +import numpy as np +from argparse import ArgumentParser + +import torch +import torch.nn as nn +import torch.nn.functional as F +import pytorch_lightning as pl + +import torchvision.transforms as transforms +from torchvision.datasets import MNIST +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.distributed import DistributedSampler + + +pl.seed_everything(234) +FLOAT16_EPSILON = np.finfo(np.float16).eps + + +class MNISTDataModule(pl.LightningDataModule): + def __init__(self, data_dir: str = './', batch_size=32, dist_sampler=False): + super().__init__() + + self.dist_sampler = dist_sampler + self.data_dir = data_dir + self.batch_size = batch_size + + self.transforms = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + # self.dims is returned when you call dm.size() + # Setting default dims here because we know them. 
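+        # For MNIST, (1, 28, 28) means (channels, height, width).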
+ # Could optionally be assigned dynamically in dm.setup() + self.dims = (1, 28, 28) + + def prepare_data(self): + # download only + MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()) + MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()) + + def setup(self, stage=None): + + # Assign train/val datasets for use in dataloaders + if stage == 'fit' or stage is None: + self.mnist_train = MNIST(self.data_dir, train=True, transform=self.transforms) + + # Assign test dataset for use in dataloader(s) + if stage == 'test' or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transforms) + + def train_dataloader(self): + dist_sampler = None + if self.dist_sampler: + dist_sampler = DistributedSampler(self.mnist_train, shuffle=False) + + return DataLoader( + self.mnist_train, batch_size=self.batch_size, sampler=dist_sampler, shuffle=False + ) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=self.batch_size, shuffle=False) + + +class SyncBNModule(pl.LightningModule): + def __init__(self, gpu_count=1, **kwargs): + super().__init__() + + self.gpu_count = gpu_count + self.bn_targets = None + if 'bn_targets' in kwargs: + self.bn_targets = kwargs['bn_targets'] + + self.linear = nn.Linear(28 * 28, 10) + self.bn_layer = nn.BatchNorm1d(28 * 28) + + def forward(self, x, batch_idx): + with torch.no_grad(): + out_bn = self.bn_layer(x.view(x.size(0), -1)) + + if self.bn_targets: + bn_target = self.bn_targets[batch_idx] + + # executes on both GPUs + bn_target = bn_target[self.trainer.local_rank::self.gpu_count] + bn_target = bn_target.to(out_bn.device) + assert torch.sum(torch.abs(bn_target - out_bn)) < FLOAT16_EPSILON + + out = self.linear(out_bn) + + return out, out_bn + + def training_step(self, batch, batch_idx): + x, y = batch + + y_hat, _ = self(x, batch_idx) + loss = F.cross_entropy(y_hat, y) + + return pl.TrainResult(loss) + + def configure_optimizers(self): + return torch.optim.Adam(self.linear.parameters(), lr=0.02) + + @staticmethod + def add_model_specific_argument(parent_parser, root_dir): + """ + Define parameters that only apply to this model + """ + parser = ArgumentParser(parents=[parent_parser]) + + parser.add_argument('--nodes', default=1, type=int) + parser.add_argument('--gpu', default=2, type=int) + parser.add_argument('--dist_backend', default='ddp', type=str) + + parser.add_argument('--epochs', default=1, type=int) + parser.add_argument('--steps', default=3, type=int) + + parser.add_argument('--bn_sync', action='store_true') + + return parser + + +def main(args, datamodule, bn_outputs): + """Main training routine specific for this project.""" + # ------------------------ + # 1 INIT LIGHTNING MODEL + # ------------------------ + model = SyncBNModule(gpu_count=args.gpu, bn_targets=bn_outputs) + + # ------------------------ + # 2 INIT TRAINER + # ------------------------ + trainer = pl.Trainer( + gpus=args.gpu, + num_nodes=args.nodes, + distributed_backend=args.dist_backend, + max_epochs=args.epochs, + max_steps=args.steps, + sync_bn=args.bn_sync, + num_sanity_val_steps=0, + replace_sampler_ddp=False, + ) + + # ------------------------ + # 3 START TRAINING + # ------------------------ + trainer.fit(model, datamodule) + + +def run_cli(): + root_dir = os.path.dirname(os.path.realpath(__file__)) + parent_parser = ArgumentParser(add_help=False) + + # define datamodule and dataloader + dm = MNISTDataModule() + dm.prepare_data() + dm.setup(stage=None) + + train_dataloader = 
dm.train_dataloader() + model = SyncBNModule() + + bn_outputs = [] + + # shuffle is false by default + for batch_idx, batch in enumerate(train_dataloader): + x, y = batch + + out, out_bn = model.forward(x, batch_idx) + bn_outputs.append(out_bn) + + # get 3 steps + if batch_idx == 2: + break + + bn_outputs = [x.cuda() for x in bn_outputs] + + # reset datamodule + # batch-size = 16 because 2 GPUs in DDP + dm = MNISTDataModule(batch_size=16, dist_sampler=True) + dm.prepare_data() + dm.setup(stage=None) + + # each LightningModule defines arguments relevant to it + parser = SyncBNModule.add_model_specific_argument(parent_parser, root_dir=root_dir) + parser = pl.Trainer.add_argparse_args(parser) + args = parser.parse_args() + + # --------------------- + # RUN TRAINING + # --------------------- + main(args, dm, bn_outputs) + + +if __name__ == '__main__': + run_cli() diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index 330135e8ea78ab..d527354647f075 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -25,6 +25,20 @@ def test_gpu_template(cli_args): run_cli() +@pytest.mark.parametrize( + 'cli_args', + ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2 --dist_backend ddp_spawn --bn_sync'] +) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_sync_bn(cli_args): + """Test running CLI for an example with sync bn.""" + from pl_examples.basic_examples.sync_bn import run_cli + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + run_cli() + + # @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2']) # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # def test_multi_node_ddp(cli_args): diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 0b90a834746127..c2e549c18ef1a9 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -176,6 +176,10 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 self.trainer.lr_schedulers = lr_schedulers self.trainer.optimizer_frequencies = optimizer_frequencies + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_bn: + model = model.configure_sync_bn(model) + # MODEL # copy model to each gpu if self.trainer.on_gpu: diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 8301cd8a71e162..85de2d1b7759ec 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -118,6 +118,10 @@ def ddp_train(self, process_idx, mp_queue, model): self.trainer.lr_schedulers = lr_schedulers self.trainer.optimizer_frequencies = optimizer_frequencies + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_bn: + model = model.configure_sync_bn(model) + # MODEL # copy model to each gpu if self.trainer.on_gpu: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index ea9898da5214e0..c09d981d1d5e3b 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -957,6 +957,23 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi 
log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + def configure_sync_bn(self, model: 'LightningModule') -> 'LightningModule': + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + def configure_apex( self, amp: object, model: 'LightningModule', optimizers: List[Optimizer], amp_level: str ) -> Tuple['LightningModule', List[Optimizer]]: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 323f2866b1cabf..ebfe680fc3372b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -184,6 +184,7 @@ def __init__( log_save_interval: int = 100, row_log_interval: int = 50, distributed_backend: Optional[str] = None, + sync_bn: bool = False, precision: int = 32, weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT, weights_save_path: Optional[str] = None, @@ -296,6 +297,8 @@ def __init__( distributed_backend: The distributed backend to use (dp, ddp, ddp2, ddp_spawn, ddp_cpu) + sync_bn: Synchronize batch norm layers between process groups/whole world. + precision: Full precision (32), half precision (16). Can be used on CPU, GPU or TPUs. weights_summary: Prints a summary of the weights when training begins. @@ -427,6 +430,9 @@ def __init__( self.num_nodes = num_nodes self.log_gpu_memory = log_gpu_memory + # sync-bn backend + self.sync_bn = sync_bn + self.gradient_clip_val = gradient_clip_val self.check_val_every_n_epoch = check_val_every_n_epoch From 2cbb1496d01213329b8f1c31936d26db8b2338b5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 13:37:11 -0400 Subject: [PATCH 03/39] Update __init__.py --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 87d62347b47089..6b948f5e07528d 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '0.9.0rc6' +__version__ = '0.9.0rc7' __author__ = 'William Falcon et al.' 
__author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From 5bbcb8db1f1f9b8803e08085d8fe8975c89070e6 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Thu, 6 Aug 2020 00:10:11 +0630 Subject: [PATCH 04/39] Improve SSIM (#2833) * make ssim fast * remove padding * pep8 * add comments for readability * plus -> coef --- .../metrics/functional/regression.py | 30 +++++++++++------- pytorch_lightning/metrics/regression.py | 4 +-- tests/metrics/functional/test_regression.py | 31 ++++++++++--------- tests/metrics/test_regression.py | 2 +- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/pytorch_lightning/metrics/functional/regression.py b/pytorch_lightning/metrics/functional/regression.py index 68f7bef93f7ea0..84f50d37270ca8 100644 --- a/pytorch_lightning/metrics/functional/regression.py +++ b/pytorch_lightning/metrics/functional/regression.py @@ -236,9 +236,9 @@ def ssim( Example: >>> pred = torch.rand([16, 1, 16, 16]) - >>> target = pred * 1.25 + >>> target = pred * 0.75 >>> ssim(pred, target) - tensor(0.9520) + tensor(0.9219) """ if pred.dtype != target.dtype: @@ -280,16 +280,24 @@ def ssim( channel = pred.size(1) kernel = _gaussian_kernel(channel, kernel_size, sigma, device) - mu_pred = F.conv2d(pred, kernel, groups=channel) - mu_target = F.conv2d(target, kernel, groups=channel) - mu_pred_sq = mu_pred.pow(2) - mu_target_sq = mu_target.pow(2) - mu_pred_target = mu_pred * mu_target - - sigma_pred_sq = F.conv2d(pred * pred, kernel, groups=channel) - mu_pred_sq - sigma_target_sq = F.conv2d(target * target, kernel, groups=channel) - mu_target_sq - sigma_pred_target = F.conv2d(pred * target, kernel, groups=channel) - mu_pred_target + # Concatenate + # pred for mu_pred + # target for mu_target + # pred * pred for sigma_pred + # target * target for sigma_target + # pred * target for sigma_pred_target + input_list = torch.cat([pred, target, pred * pred, target * target, pred * target]) # (5 * B, C, H, W) + outputs = F.conv2d(input_list, kernel, groups=channel) + output_list = [outputs[x * pred.size(0): (x + 1) * pred.size(0)] for x in range(len(outputs))] + + mu_pred_sq = output_list[0].pow(2) + mu_target_sq = output_list[1].pow(2) + mu_pred_target = output_list[0] * output_list[1] + + sigma_pred_sq = output_list[2] - mu_pred_sq + sigma_target_sq = output_list[3] - mu_target_sq + sigma_pred_target = output_list[4] - mu_pred_target UPPER = 2 * sigma_pred_target + C2 LOWER = sigma_pred_sq + sigma_target_sq + C2 diff --git a/pytorch_lightning/metrics/regression.py b/pytorch_lightning/metrics/regression.py index 5b69868e1f776c..a2cbaaf4f822a9 100644 --- a/pytorch_lightning/metrics/regression.py +++ b/pytorch_lightning/metrics/regression.py @@ -241,10 +241,10 @@ class SSIM(Metric): Example: >>> pred = torch.rand([16, 1, 16, 16]) - >>> target = pred * 1.25 + >>> target = pred * 0.75 >>> metric = SSIM() >>> metric(pred, target) - tensor(0.9520) + tensor(0.9219) """ def __init__( diff --git a/tests/metrics/functional/test_regression.py b/tests/metrics/functional/test_regression.py index c9df4f1ba3b9e8..cd251c77a98fc1 100644 --- a/tests/metrics/functional/test_regression.py +++ b/tests/metrics/functional/test_regression.py @@ -97,24 +97,25 @@ def test_psnr_against_sklearn(sklearn_metric, torch_metric): assert torch.allclose(sk_score, pl_score) -@pytest.mark.parametrize(['size', 'channel', 'plus', 'multichannel'], [ - pytest.param(16, 1, 0.125, False), - pytest.param(32, 1, 0.25, False), - pytest.param(48, 3, 0.5, True), - pytest.param(64, 4, 0.75, True), - pytest.param(128, 5, 1, True) 
+@pytest.mark.parametrize(['size', 'channel', 'coef', 'multichannel'], [ + pytest.param(16, 1, 0.9, False), + pytest.param(32, 3, 0.8, True), + pytest.param(48, 4, 0.7, True), + pytest.param(64, 5, 0.6, True) ]) -def test_ssim(size, channel, plus, multichannel): +def test_ssim(size, channel, coef, multichannel): device = "cuda" if torch.cuda.is_available() else "cpu" - pred = torch.rand(1, channel, size, size, device=device) - target = pred + plus - ssim_idx = ssim(pred, target) - np_pred = np.random.rand(size, size, channel) + pred = torch.rand(size, channel, size, size, device=device) + target = pred * coef + ssim_idx = ssim(pred, target, data_range=1.0) + np_pred = pred.permute(0, 2, 3, 1).cpu().numpy() if multichannel is False: - np_pred = np_pred[:, :, 0] - np_target = np.add(np_pred, plus) - sk_ssim_idx = ski_ssim(np_pred, np_target, win_size=11, multichannel=multichannel, gaussian_weights=True) - assert torch.allclose(ssim_idx, torch.tensor(sk_ssim_idx, dtype=torch.float, device=device), atol=1e-2, rtol=1e-2) + np_pred = np_pred[:, :, :, 0] + np_target = np.multiply(np_pred, coef) + sk_ssim_idx = ski_ssim( + np_pred, np_target, win_size=11, multichannel=multichannel, gaussian_weights=True, data_range=1.0 + ) + assert torch.allclose(ssim_idx, torch.tensor(sk_ssim_idx, dtype=torch.float, device=device), atol=1e-4) ssim_idx = ssim(pred, pred) assert torch.allclose(ssim_idx, torch.tensor(1.0, device=device)) diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py index 955e6253e3225b..e5ecd51c775b90 100644 --- a/tests/metrics/test_regression.py +++ b/tests/metrics/test_regression.py @@ -65,6 +65,6 @@ def test_ssim(): assert ssim.name == 'ssim' pred = torch.rand([16, 1, 16, 16]) - target = pred * 1.25 + target = pred * 0.75 score = ssim(pred, target) assert isinstance(score, torch.Tensor) From 6034d5e37d508bda133e9cbf4d0f590c7d173f56 Mon Sep 17 00:00:00 2001 From: "Ruotian(RT) Luo" Date: Wed, 5 Aug 2020 12:42:21 -0500 Subject: [PATCH 05/39] fix apex gradient clipping (#2829) --- pytorch_lightning/trainer/training_loop.py | 4 ++-- pytorch_lightning/trainer/training_tricks.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index e0a7b43a872aa3..c8cb81ed090b19 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -291,7 +291,7 @@ def transfer_batch_to_tpu(self, *args): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod - def clip_gradients(self): + def clip_gradients(self, *args): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod @@ -817,7 +817,7 @@ def run_batch_backward_pass(self, split_batch, batch_idx, opt_idx, optimizer): # ------------------ if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu: self.scaler.unscale_(optimizer) - self.clip_gradients() + self.clip_gradients(optimizer) # ------------------ # .STEP + ZERO_GRAD diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py index 5bea8fbc1a3cd6..44b66407c7645d 100644 --- a/pytorch_lightning/trainer/training_tricks.py +++ b/pytorch_lightning/trainer/training_tricks.py @@ -27,9 +27,17 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.loggers.base import DummyLogger +from 
pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import is_oom_error, garbage_collection_cuda +try: + from apex import amp +except ImportError: + APEX_AVAILABLE = False +else: + APEX_AVAILABLE = True + EPSILON = 1e-6 EPSILON_FP16 = 1e-5 @@ -60,14 +68,17 @@ def restore(self, *args): def fit(self, *args): """Warning: this is just empty shell for code implemented in other class.""" - def clip_gradients(self): + def clip_gradients(self, optimizer): # this code is a modification of torch.nn.utils.clip_grad_norm_ # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md if self.gradient_clip_val <= 0: return model = self.get_model() - parameters = model.parameters() + if self.use_amp and not NATIVE_AMP_AVALAIBLE: + parameters = amp.master_params(optimizer) + else: + parameters = model.parameters() max_norm = float(self.gradient_clip_val) norm_type = float(2.0) if isinstance(parameters, torch.Tensor): From bef27c58eda4c4425c8aa750d38e16522bfcbe39 Mon Sep 17 00:00:00 2001 From: "Ruotian(RT) Luo" Date: Wed, 5 Aug 2020 12:43:50 -0500 Subject: [PATCH 06/39] save apex scaler states (#2828) --- pytorch_lightning/trainer/training_io.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 90f36ecab59c32..666dfbb2588b72 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -115,6 +115,13 @@ else: XLA_AVAILABLE = True +try: + from apex import amp +except ImportError: + APEX_AVAILABLE = False +else: + APEX_AVAILABLE = True + try: import horovod.torch as hvd except (ModuleNotFoundError, ImportError): @@ -317,6 +324,8 @@ def restore(self, checkpoint_path: str, on_gpu: bool): # restore amp scaling if self.use_amp and NATIVE_AMP_AVALAIBLE and 'native_amp_scaling_state' in checkpoint: self.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + elif self.use_amp and not NATIVE_AMP_AVALAIBLE and 'amp_scaling_state' in checkpoint: + amp.load_state_dict(checkpoint['amp_scaling_state']) # load training state (affects trainer only) self.restore_training_state(checkpoint) @@ -368,6 +377,8 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: # save native amp scaling if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu: checkpoint['native_amp_scaling_state'] = self.scaler.state_dict() + elif self.use_amp and not NATIVE_AMP_AVALAIBLE: + checkpoint['amp_scaling_state'] = amp.state_dict() # add the module_arguments and state_dict from the model model = self.get_model() @@ -523,6 +534,8 @@ def hpc_load(self, folderpath, on_gpu): # restore amp scaling if self.use_amp and NATIVE_AMP_AVALAIBLE and 'native_amp_scaling_state' in checkpoint: self.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + elif self.use_amp and not NATIVE_AMP_AVALAIBLE and 'amp_scaling_state' in checkpoint: + amp.load_state_dict(checkpoint['amp_scaling_state']) if self.root_gpu is not None: model.cuda(self.root_gpu) From d09098ca5af65f200be2ff2a9a97d6204beff4c7 Mon Sep 17 00:00:00 2001 From: Rosario Scalise Date: Wed, 5 Aug 2020 12:06:26 -0700 Subject: [PATCH 07/39] [DOCS] title clarification in Results page (#2827) * title tweak * remove changes in new-project Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- docs/source/results.rst | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/docs/source/results.rst b/docs/source/results.rst index a5b01ee42e635b..960cda2bcf399f 100644 --- a/docs/source/results.rst +++ b/docs/source/results.rst @@ -40,8 +40,8 @@ using the equivalent syntax via the `TrainResult` object: -------------------- -Validation loop example ------------------------ +Validation/Test loop example +----------------------------- We can replace the following validation/test loop: .. code-block:: python From 2242af11b677395898951ef620ad65c64d7603a8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 21:43:33 +0200 Subject: [PATCH 08/39] another try to filter master from CircleCI jobs (#2734) * circleci config * Apply suggestions from code review * miss --- .circleci/config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9d287432327113..fdb217baff47eb 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -153,11 +153,11 @@ workflows: filters: branches: # https://discuss.circleci.com/t/create-separate-steps-jobs-for-pr-forks-versus-branches/13419/4 - only: - # only from forks - - /^pull\/.*$/ - # only from canonical repository - - /^(?!pull\/).*$/ + #only: + # # only from forks + # - /^pull\/.\d+$/ + ignore: + - master cleanup: triggers: - schedule: From 5d0f0325d854a901fbde85fab909cd866b30fc7c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 15:57:26 -0400 Subject: [PATCH 09/39] Revert "Support limit_mode_batches (int) for infinite dataloader" (#2839) * Revert "Support limit_mode_batches (int) for infinite dataloader (#2787)" This reverts commit de9c9f0864418a83f295e4c87be50e12645bd83a. * Update training_tricks.py --- CHANGELOG.md | 2 - docs/source/sequences.rst | 21 ++---- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/data_loading.py | 45 +++++++------ tests/models/test_onnx_save.py | 2 +- tests/trainer/test_dataloaders.py | 79 +++-------------------- 6 files changed, 40 insertions(+), 111 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00fa8d1bf8985f..2e800c28964ff8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,8 +33,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) -- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) - ### Changed - Truncated long version numbers in progress bar ([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594)) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index b9a8f2ee642aad..e24ee5bbca1cc9 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -49,8 +49,8 @@ Lightning can handle TBTT automatically via this flag. .. note:: If you need to modify how the batch is split, override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`. -.. note:: Using this feature requires updating your LightningModule's - :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg. +.. 
note:: Using this feature requires updating your LightningModule's :meth:`pytorch_lightning.core.LightningModule.training_step` to include + a `hiddens` arg. ---------- @@ -59,13 +59,10 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the ``val_check_interval`` to 1.0 (the default) or an int - (specifying the number of training batches to run before validation) when initializing the Trainer. This is - because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation - interval when ``val_check_interval`` is less than one. Similarly, you can set ``limit_{mode}_batches`` to a float or - an int. If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` - to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. - Here mode can be train/val/test. +.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or to an int + (specifying the number of training batches to run before validation) when initializing the Trainer. + This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate + the validation interval when val_check_interval is less than one. .. testcode:: @@ -90,9 +87,3 @@ option when using sequential data. # Set val_check_interval trainer = Trainer(val_check_interval=100) - - # Set limit_val_batches to 0.0 or 0 - trainer = Trainer(limit_val_batches=0.0) - - # Set limit_val_batches as an int - trainer = Trainer(limit_val_batches=100) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index c09d981d1d5e3b..80081c0dd446f6 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1771,7 +1771,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError('`input_sample` and `example_input_array` tensors are both missing.') + raise ValueError(f'input_sample and example_input_array tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 4eec847580636f..09186765c6eeec 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -212,19 +212,18 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # automatically add samplers self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True) - self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: - self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) - elif self.num_training_batches != float('inf'): - self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - elif self.limit_train_batches != 1.0: - raise MisconfigurationException( - 'When using an IterableDataset for `limit_train_batches`,' - ' 
`Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - ' `num_training_batches` to use.') + if not _has_len(self.train_dataloader): + self.num_training_batches = float('inf') + else: + # try getting the length + if isinstance(self.limit_train_batches, float): + self.num_training_batches = len(self.train_dataloader) + self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) + else: + self.num_training_batches = min(len(self.train_dataloader), self.limit_train_batches) # determine when to check validation # if int passed in, val checks that often @@ -242,7 +241,8 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = float('inf') else: raise MisconfigurationException( - 'When using an IterableDataset for `train_dataloader`,' + 'When using an infinite DataLoader (e.g. with an IterableDataset' + ' or when DataLoader does not implement `__len__`) for `train_dataloader`,' ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: @@ -304,21 +304,24 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') - self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') - # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: - num_batches = min(num_batches, int(limit_eval_batches)) - elif num_batches != float('inf'): - num_batches = int(num_batches * limit_eval_batches) - elif limit_eval_batches != 1.0: + if num_batches != float('inf'): + self._check_batch_limits(f'limit_{mode}_batches') + + # limit num batches either as a percent or num steps + if isinstance(limit_eval_batches, float): + num_batches = int(num_batches * limit_eval_batches) + else: + num_batches = min(len(dataloader), limit_eval_batches) + + elif limit_eval_batches not in (0.0, 1.0): raise MisconfigurationException( - 'When using an IterableDataset for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - f' `num_{mode}_batches` to use.') + 'When using an infinite DataLoader (e.g. 
with an IterableDataset' + f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0` or `1.0`.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) diff --git a/tests/models/test_onnx_save.py b/tests/models/test_onnx_save.py index 7cb40561f77318..f824f33c93bc14 100644 --- a/tests/models/test_onnx_save.py +++ b/tests/models/test_onnx_save.py @@ -84,7 +84,7 @@ def test_error_if_no_input(tmpdir): model = EvalModelTemplate() model.example_input_array = None file_path = os.path.join(tmpdir, "model.onxx") - with pytest.raises(ValueError, match=r'`input_sample` and `example_input_array` tensors are both missing'): + with pytest.raises(ValueError, match=r'input_sample and example_input_array tensors are both missing'): model.to_onnx(file_path) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1aad5047855a2e..1c7e21b7a72bb5 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -256,69 +256,6 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): f'Multiple `test_dataloaders` not initiated properly, got {trainer.test_dataloaders}' -@pytest.mark.parametrize( - ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], - [ - pytest.param(0.0, 0.0, 0.0), - pytest.param(1.0, 1.0, 1.0), - ] -) -def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, - limit_val_batches, limit_test_batches): - """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" - model = EvalModelTemplate() - model.train_dataloader = model.train_dataloader__infinite - model.val_dataloader = model.val_dataloader__infinite - model.test_dataloader = model.test_dataloader__infinite - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=limit_train_batches, - limit_val_batches=limit_val_batches, - limit_test_batches=limit_test_batches, - ) - - results = trainer.fit(model) - assert results == 1 - assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') - assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') - - trainer.test(ckpt_path=None) - assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') - - -@pytest.mark.parametrize( - ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], - [ - pytest.param(0, 0, 0), - pytest.param(10, 10, 10), - ] -) -def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify inf train, val & test dataloaders (e.g. 
IterableDataset) passed with batch limit as number""" - model = EvalModelTemplate() - model.train_dataloader = model.train_dataloader__infinite - model.val_dataloader = model.val_dataloader__infinite - model.test_dataloader = model.test_dataloader__infinite - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=limit_train_batches, - limit_val_batches=limit_val_batches, - limit_test_batches=limit_test_batches, - ) - - results = trainer.fit(model) - assert results - assert trainer.num_training_batches == limit_train_batches - assert trainer.num_val_batches[0] == limit_val_batches - - trainer.test(ckpt_path=None) - assert trainer.num_test_batches[0] == limit_test_batches - - @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], [ @@ -329,7 +266,7 @@ def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for train, val & test dataloaders passed with batch limit in percent""" + """Verify num_batches for val & test dataloaders passed with batch limit in percent""" model = EvalModelTemplate() model.val_dataloader = model.val_dataloader__multiple_mixed_length model.test_dataloader = model.test_dataloader__multiple_mixed_length @@ -370,7 +307,7 @@ def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for train, val & test dataloaders passed with batch limit as number""" + """Verify num_batches for val & test dataloaders passed with batch limit as number""" os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() @@ -499,7 +436,7 @@ def test_train_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -510,7 +447,7 @@ def test_val_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -521,7 +458,7 @@ def test_test_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.test(model) @@ -837,7 +774,7 @@ def test_train_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -848,7 +785,7 @@ def test_val_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -859,5 +796,5 @@ 
def test_test_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.test(model) From 633cf76c686357c88f2d6397fa316ed710004184 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 15:58:27 -0400 Subject: [PATCH 10/39] Update __init__.py --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 6b948f5e07528d..f3bde1c7ee5738 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '0.9.0rc7' +__version__ = '0.9.0rc8' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From a5f2b89ed08172c25fd1cdc3d884d0fbb60bc45c Mon Sep 17 00:00:00 2001 From: Ananya Harsh Jha Date: Wed, 5 Aug 2020 19:12:11 -0400 Subject: [PATCH 11/39] updated sync bn (#2838) * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * added ddp_spawn test * updated test * clean * clean Co-authored-by: Jirka Borovec --- pl_examples/basic_examples/sync_bn.py | 204 ------------------ pl_examples/test_examples.py | 14 -- .../accelerator_backends/ddp_backend.py | 4 +- .../accelerator_backends/ddp_spawn_backend.py | 4 +- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/__init__.py | 8 + pytorch_lightning/trainer/trainer.py | 6 +- tests/base/datamodules.py | 48 ++++- tests/models/test_sync_batchnorm.py | 100 +++++++++ 9 files changed, 163 insertions(+), 227 deletions(-) delete mode 100644 pl_examples/basic_examples/sync_bn.py create mode 100644 tests/models/test_sync_batchnorm.py diff --git a/pl_examples/basic_examples/sync_bn.py b/pl_examples/basic_examples/sync_bn.py deleted file mode 100644 index bb602a74ea89c9..00000000000000 --- a/pl_examples/basic_examples/sync_bn.py +++ /dev/null @@ -1,204 +0,0 @@ -""" -Sync-bn with DDP (GPU) - -This code is to verify that batch statistics are synchronized across GPUs using sync-bn. -When sync_bn is set to True the training loop should run for 3 iterations. -When sync_bn is set to False, the code should result in an AssertionError. -""" -import os -import math -import numpy as np -from argparse import ArgumentParser - -import torch -import torch.nn as nn -import torch.nn.functional as F -import pytorch_lightning as pl - -import torchvision.transforms as transforms -from torchvision.datasets import MNIST -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.distributed import DistributedSampler - - -pl.seed_everything(234) -FLOAT16_EPSILON = np.finfo(np.float16).eps - - -class MNISTDataModule(pl.LightningDataModule): - def __init__(self, data_dir: str = './', batch_size=32, dist_sampler=False): - super().__init__() - - self.dist_sampler = dist_sampler - self.data_dir = data_dir - self.batch_size = batch_size - - self.transforms = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - - # self.dims is returned when you call dm.size() - # Setting default dims here because we know them. 
- # Could optionally be assigned dynamically in dm.setup() - self.dims = (1, 28, 28) - - def prepare_data(self): - # download only - MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()) - MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()) - - def setup(self, stage=None): - - # Assign train/val datasets for use in dataloaders - if stage == 'fit' or stage is None: - self.mnist_train = MNIST(self.data_dir, train=True, transform=self.transforms) - - # Assign test dataset for use in dataloader(s) - if stage == 'test' or stage is None: - self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transforms) - - def train_dataloader(self): - dist_sampler = None - if self.dist_sampler: - dist_sampler = DistributedSampler(self.mnist_train, shuffle=False) - - return DataLoader( - self.mnist_train, batch_size=self.batch_size, sampler=dist_sampler, shuffle=False - ) - - def test_dataloader(self): - return DataLoader(self.mnist_test, batch_size=self.batch_size, shuffle=False) - - -class SyncBNModule(pl.LightningModule): - def __init__(self, gpu_count=1, **kwargs): - super().__init__() - - self.gpu_count = gpu_count - self.bn_targets = None - if 'bn_targets' in kwargs: - self.bn_targets = kwargs['bn_targets'] - - self.linear = nn.Linear(28 * 28, 10) - self.bn_layer = nn.BatchNorm1d(28 * 28) - - def forward(self, x, batch_idx): - with torch.no_grad(): - out_bn = self.bn_layer(x.view(x.size(0), -1)) - - if self.bn_targets: - bn_target = self.bn_targets[batch_idx] - - # executes on both GPUs - bn_target = bn_target[self.trainer.local_rank::self.gpu_count] - bn_target = bn_target.to(out_bn.device) - assert torch.sum(torch.abs(bn_target - out_bn)) < FLOAT16_EPSILON - - out = self.linear(out_bn) - - return out, out_bn - - def training_step(self, batch, batch_idx): - x, y = batch - - y_hat, _ = self(x, batch_idx) - loss = F.cross_entropy(y_hat, y) - - return pl.TrainResult(loss) - - def configure_optimizers(self): - return torch.optim.Adam(self.linear.parameters(), lr=0.02) - - @staticmethod - def add_model_specific_argument(parent_parser, root_dir): - """ - Define parameters that only apply to this model - """ - parser = ArgumentParser(parents=[parent_parser]) - - parser.add_argument('--nodes', default=1, type=int) - parser.add_argument('--gpu', default=2, type=int) - parser.add_argument('--dist_backend', default='ddp', type=str) - - parser.add_argument('--epochs', default=1, type=int) - parser.add_argument('--steps', default=3, type=int) - - parser.add_argument('--bn_sync', action='store_true') - - return parser - - -def main(args, datamodule, bn_outputs): - """Main training routine specific for this project.""" - # ------------------------ - # 1 INIT LIGHTNING MODEL - # ------------------------ - model = SyncBNModule(gpu_count=args.gpu, bn_targets=bn_outputs) - - # ------------------------ - # 2 INIT TRAINER - # ------------------------ - trainer = pl.Trainer( - gpus=args.gpu, - num_nodes=args.nodes, - distributed_backend=args.dist_backend, - max_epochs=args.epochs, - max_steps=args.steps, - sync_bn=args.bn_sync, - num_sanity_val_steps=0, - replace_sampler_ddp=False, - ) - - # ------------------------ - # 3 START TRAINING - # ------------------------ - trainer.fit(model, datamodule) - - -def run_cli(): - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - - # define datamodule and dataloader - dm = MNISTDataModule() - dm.prepare_data() - dm.setup(stage=None) - - train_dataloader = 
dm.train_dataloader() - model = SyncBNModule() - - bn_outputs = [] - - # shuffle is false by default - for batch_idx, batch in enumerate(train_dataloader): - x, y = batch - - out, out_bn = model.forward(x, batch_idx) - bn_outputs.append(out_bn) - - # get 3 steps - if batch_idx == 2: - break - - bn_outputs = [x.cuda() for x in bn_outputs] - - # reset datamodule - # batch-size = 16 because 2 GPUs in DDP - dm = MNISTDataModule(batch_size=16, dist_sampler=True) - dm.prepare_data() - dm.setup(stage=None) - - # each LightningModule defines arguments relevant to it - parser = SyncBNModule.add_model_specific_argument(parent_parser, root_dir=root_dir) - parser = pl.Trainer.add_argparse_args(parser) - args = parser.parse_args() - - # --------------------- - # RUN TRAINING - # --------------------- - main(args, dm, bn_outputs) - - -if __name__ == '__main__': - run_cli() diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index d527354647f075..330135e8ea78ab 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -25,20 +25,6 @@ def test_gpu_template(cli_args): run_cli() -@pytest.mark.parametrize( - 'cli_args', - ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2 --dist_backend ddp_spawn --bn_sync'] -) -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_sync_bn(cli_args): - """Test running CLI for an example with sync bn.""" - from pl_examples.basic_examples.sync_bn import run_cli - - cli_args = cli_args.split(' ') if cli_args else [] - with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - run_cli() - - # @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2']) # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # def test_multi_node_ddp(cli_args): diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index c2e549c18ef1a9..44ad52d34ba2f9 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -177,8 +177,8 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 self.trainer.optimizer_frequencies = optimizer_frequencies # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_bn: - model = model.configure_sync_bn(model) + if self.trainer.sync_batchnorm: + model = model.configure_sync_batchnorm(model) # MODEL # copy model to each gpu diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 85de2d1b7759ec..704fc5558588a4 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -119,8 +119,8 @@ def ddp_train(self, process_idx, mp_queue, model): self.trainer.optimizer_frequencies = optimizer_frequencies # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_bn: - model = model.configure_sync_bn(model) + if self.trainer.sync_batchnorm: + model = model.configure_sync_batchnorm(model) # MODEL # copy model to each gpu diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 80081c0dd446f6..d272c23fd9a659 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -957,7 +957,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi 
log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - def configure_sync_bn(self, model: 'LightningModule') -> 'LightningModule': + def configure_sync_batchnorm(self, model: 'LightningModule') -> 'LightningModule': """ Add global batchnorm for a model spread across multiple GPUs and nodes. diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 0164210c771fbd..8dcec8eb305110 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -855,6 +855,14 @@ def on_train_end(self, trainer, pl_module): # default used by the Trainer trainer = Trainer(row_log_interval=50) +sync_batchnorm +^^^^^^^^^^^^^^ + +Enable synchronization between batchnorm layers across all GPUs. + +.. testcode:: + + trainer = Trainer(sync_batchnorm=True) val_percent_check ^^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ebfe680fc3372b..4b342328df2979 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -184,7 +184,7 @@ def __init__( log_save_interval: int = 100, row_log_interval: int = 50, distributed_backend: Optional[str] = None, - sync_bn: bool = False, + sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT, weights_save_path: Optional[str] = None, @@ -297,7 +297,7 @@ def __init__( distributed_backend: The distributed backend to use (dp, ddp, ddp2, ddp_spawn, ddp_cpu) - sync_bn: Synchronize batch norm layers between process groups/whole world. + sync_batchnorm: Synchronize batch norm layers between process groups/whole world. precision: Full precision (32), half precision (16). Can be used on CPU, GPU or TPUs. @@ -431,7 +431,7 @@ def __init__( self.log_gpu_memory = log_gpu_memory # sync-bn backend - self.sync_bn = sync_bn + self.sync_batchnorm = sync_batchnorm self.gradient_clip_val = gradient_clip_val self.check_val_every_n_epoch = check_val_every_n_epoch diff --git a/tests/base/datamodules.py b/tests/base/datamodules.py index a55a9a718ea9d2..d1f7fabf8d6b46 100644 --- a/tests/base/datamodules.py +++ b/tests/base/datamodules.py @@ -1,7 +1,9 @@ +import os from torch.utils.data import random_split, DataLoader from pytorch_lightning.core.datamodule import LightningDataModule -from tests.base.datasets import TrialMNIST +from tests.base.datasets import TrialMNIST, MNIST +from torch.utils.data.distributed import DistributedSampler class TrialMNISTDataModule(LightningDataModule): @@ -36,3 +38,47 @@ def val_dataloader(self): def test_dataloader(self): return DataLoader(self.mnist_test, batch_size=32) + + +class MNISTDataModule(LightningDataModule): + def __init__( + self, data_dir: str = './', batch_size: int = 32, dist_sampler: bool = False + ) -> None: + super().__init__() + + self.dist_sampler = dist_sampler + self.data_dir = data_dir + self.batch_size = batch_size + + # self.dims is returned when you call dm.size() + # Setting default dims here because we know them. 
+ # Could optionally be assigned dynamically in dm.setup() + self.dims = (1, 28, 28) + + def prepare_data(self): + # download only + MNIST(self.data_dir, train=True, download=True, normalize=(0.1307, 0.3081)) + MNIST(self.data_dir, train=False, download=True, normalize=(0.1307, 0.3081)) + + def setup(self, stage: str = None): + + # Assign train/val datasets for use in dataloaders + # TODO: need to split using random_split once updated to torch >= 1.6 + if stage == 'fit' or stage is None: + self.mnist_train = MNIST(self.data_dir, train=True, normalize=(0.1307, 0.3081)) + + # Assign test dataset for use in dataloader(s) + if stage == 'test' or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, normalize=(0.1307, 0.3081)) + + def train_dataloader(self): + dist_sampler = None + if self.dist_sampler: + dist_sampler = DistributedSampler(self.mnist_train, shuffle=False) + + return DataLoader( + self.mnist_train, batch_size=self.batch_size, sampler=dist_sampler, shuffle=False + ) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=self.batch_size, shuffle=False) diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py new file mode 100644 index 00000000000000..5aff30d0aacbd9 --- /dev/null +++ b/tests/models/test_sync_batchnorm.py @@ -0,0 +1,100 @@ +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F + +from pytorch_lightning import Trainer, seed_everything, LightningModule, TrainResult +from pytorch_lightning.utilities import FLOAT16_EPSILON +from tests.base.datamodules import MNISTDataModule +from tests.base.develop_utils import set_random_master_port + + +class SyncBNModule(LightningModule): + def __init__(self, gpu_count=1, **kwargs): + super().__init__() + + self.gpu_count = gpu_count + self.bn_targets = None + if 'bn_targets' in kwargs: + self.bn_targets = kwargs['bn_targets'] + + self.linear = nn.Linear(28 * 28, 10) + self.bn_layer = nn.BatchNorm1d(28 * 28) + + def forward(self, x, batch_idx): + with torch.no_grad(): + out_bn = self.bn_layer(x.view(x.size(0), -1)) + + if self.bn_targets: + bn_target = self.bn_targets[batch_idx] + + # executes on both GPUs + bn_target = bn_target[self.trainer.local_rank::self.gpu_count] + bn_target = bn_target.to(out_bn.device) + assert torch.sum(torch.abs(bn_target - out_bn)) < FLOAT16_EPSILON + + out = self.linear(out_bn) + + return out, out_bn + + def training_step(self, batch, batch_idx): + x, y = batch + + y_hat, _ = self(x, batch_idx) + loss = F.cross_entropy(y_hat, y) + + return TrainResult(loss) + + def configure_optimizers(self): + return torch.optim.Adam(self.linear.parameters(), lr=0.02) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_sync_batchnorm_ddp(tmpdir): + seed_everything(234) + set_random_master_port() + + # define datamodule and dataloader + dm = MNISTDataModule() + dm.prepare_data() + dm.setup(stage=None) + + train_dataloader = dm.train_dataloader() + model = SyncBNModule() + + bn_outputs = [] + + # shuffle is false by default + for batch_idx, batch in enumerate(train_dataloader): + x, _ = batch + + _, out_bn = model.forward(x, batch_idx) + bn_outputs.append(out_bn) + + # get 3 steps + if batch_idx == 2: + break + + bn_outputs = [x.cuda() for x in bn_outputs] + + # reset datamodule + # batch-size = 16 because 2 GPUs in DDP + dm = MNISTDataModule(batch_size=16, dist_sampler=True) + dm.prepare_data() + dm.setup(stage=None) + + model = SyncBNModule(gpu_count=2, 
bn_targets=bn_outputs) + + trainer = Trainer( + gpus=2, + num_nodes=1, + distributed_backend='ddp_spawn', + max_epochs=1, + max_steps=3, + sync_batchnorm=True, + num_sanity_val_steps=0, + replace_sampler_ddp=False, + ) + + result = trainer.fit(model, dm) + assert result == 1, "Sync batchnorm failing with DDP" From b507c42c478e7b99acb37f1fe9a0bf72285a1b17 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 20:01:30 -0400 Subject: [PATCH 12/39] clarify batch hooks (#2842) * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook --- pytorch_lightning/callbacks/base.py | 8 ++++++++ pytorch_lightning/callbacks/lr_logger.py | 2 +- pytorch_lightning/callbacks/progress.py | 10 +++++----- pytorch_lightning/core/hooks.py | 21 +++++++++++++++++++++ pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/callback_hook.py | 12 +++++++++++- pytorch_lightning/trainer/lr_finder.py | 2 +- pytorch_lightning/trainer/training_loop.py | 19 +++++++++++++++++++ tests/callbacks/test_callbacks.py | 16 ++++++++++++++++ tests/callbacks/test_progress_bar.py | 8 ++++---- tests/core/test_datamodules.py | 6 +++--- tests/loggers/test_all.py | 2 +- tests/trainer/test_trainer.py | 4 ++-- tests/utilities/test_dtype_device_mixin.py | 2 +- 14 files changed, 94 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index a9c6e1fb520cb2..82a0e6b0436a65 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -46,6 +46,14 @@ def on_sanity_check_end(self, trainer, pl_module): """Called when the validation sanity check ends.""" pass + def on_train_batch_start(self, trainer, pl_module): + """Called when the training batch begins.""" + pass + + def on_train_batch_end(self, trainer, pl_module): + """Called when the training batch ends.""" + pass + def on_train_epoch_start(self, trainer, pl_module): """Called when the train epoch begins.""" pass diff --git a/pytorch_lightning/callbacks/lr_logger.py b/pytorch_lightning/callbacks/lr_logger.py index 87953d496b3ad9..7ec73b8c888119 100755 --- a/pytorch_lightning/callbacks/lr_logger.py +++ b/pytorch_lightning/callbacks/lr_logger.py @@ -64,7 +64,7 @@ def on_train_start(self, trainer, pl_module): # Initialize for storing values self.lrs = {name: [] for name in names} - def on_batch_start(self, trainer, pl_module): + def on_train_batch_start(self, trainer, pl_module): latest_stat = self._extract_lr(trainer, 'step') if trainer.logger and latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 0acdbcc7509eab..4ab990f74724e8 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -36,8 +36,8 @@ def __init__(self): def disable(self): self.enable = False - def on_batch_end(self, trainer, pl_module): - super().on_batch_end(trainer, pl_module) # don't forget this :) + def on_train_batch_end(self, trainer, pl_module): + super().on_train_batch_end(trainer, pl_module) # don't forget this :) percent = (self.train_batch_idx / self.total_train_batches) * 100 sys.stdout.flush() sys.stdout.write(f'{percent:.01f} percent complete \r') @@ -138,7 +138,7 @@ def on_train_start(self, trainer, pl_module): def on_epoch_start(self, trainer, pl_module):
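        # the base progress bar restarts its train batch count at the start of every epoch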
self._train_batch_idx = 0 - def on_batch_end(self, trainer, pl_module): + def on_train_batch_end(self, trainer, pl_module): self._train_batch_idx += 1 def on_validation_start(self, trainer, pl_module): @@ -318,8 +318,8 @@ def on_epoch_start(self, trainer, pl_module): self.main_progress_bar.reset(convert_inf(total_batches)) self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch + 1}') - def on_batch_end(self, trainer, pl_module): - super().on_batch_end(trainer, pl_module) + def on_train_batch_end(self, trainer, pl_module): + super().on_train_batch_end(trainer, pl_module) if self.is_enabled and self.train_batch_idx % self.refresh_rate == 0: self.main_progress_bar.update(self.refresh_rate) self.main_progress_bar.set_postfix(trainer.progress_bar_dict) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 8c6b726ac31d27..1218dcbe6760fd 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -77,6 +77,23 @@ def on_train_end(self) -> None: """ # do something at the end of training + def on_train_batch_start(self, batch: Any) -> None: + """ + Called in the training loop before anything happens for that batch. + + If you return -1 here, you will skip training for the rest of the current epoch. + + Args: + batch: The batched data as it is returned by the training DataLoader. + """ + # do something when the batch starts + + def on_train_batch_end(self) -> None: + """ + Called in the training loop after the batch. + """ + # do something when the batch ends + def on_batch_start(self, batch: Any) -> None: """ Called in the training loop before anything happens for that batch. @@ -85,12 +102,16 @@ def on_batch_start(self, batch: Any) -> None: Args: batch: The batched data as it is returned by the training DataLoader. + + .. warning:: Deprecated in 0.9.0, will be removed in 1.0.0 (use `on_train_batch_start` instead) """ # do something when the batch starts def on_batch_end(self) -> None: """ Called in the training loop after the batch. + + .. warning:: Deprecated in 0.9.0, will be removed in 1.0.0 (use `on_train_batch_end` instead) """ # do something when the batch ends diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d272c23fd9a659..f816726ddf1e17 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1771,7 +1771,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError(f'input_sample and example_input_array tensors are both missing.') + raise ValueError('input_sample and example_input_array tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 89b5e712c91909..7c627434553172 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -9,7 +9,7 @@ class TrainerCallbackHookMixin(ABC): # this is just a summary on variables used in this abstract class, # the proper values/initialisation should be done in child class callbacks: List[Callback] = [] - get_model: Callable = ...
+ get_model: Callable def setup(self, stage: str): """Called in the beginning of fit and test""" @@ -111,6 +111,16 @@ def on_batch_end(self): for callback in self.callbacks: callback.on_batch_end(self, self.get_model()) + def on_train_batch_start(self): + """Called when the training batch begins.""" + for callback in self.callbacks: + callback.on_train_batch_start(self, self.get_model()) + + def on_train_batch_end(self): + """Called when the training batch ends.""" + for callback in self.callbacks: + callback.on_train_batch_end(self, self.get_model()) + def on_validation_batch_start(self): """Called when the validation batch begins.""" for callback in self.callbacks: diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py index 3b2778d24071c4..23ad702956e848 100755 --- a/pytorch_lightning/trainer/lr_finder.py +++ b/pytorch_lightning/trainer/lr_finder.py @@ -382,7 +382,7 @@ def on_batch_start(self, trainer, pl_module): self.lrs.append(trainer.lr_schedulers[0]['scheduler'].lr[0]) - def on_batch_end(self, trainer, pl_module): + def on_train_batch_end(self, trainer, pl_module): """ Called when the training batch ends, logs the calculated loss """ if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0: return diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index c8cb81ed090b19..993e8ccd53fd08 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -263,6 +263,8 @@ class TrainerTrainLoopMixin(ABC): on_train_end: Callable on_batch_start: Callable on_batch_end: Callable + on_train_batch_start: Callable + on_train_batch_end: Callable on_epoch_start: Callable on_epoch_end: Callable on_validation_end: Callable @@ -690,6 +692,7 @@ def run_training_batch(self, batch, batch_idx): return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic) # Batch start events + # TODO: deprecate 1.0 with self.profiler.profile('on_batch_start'): # callbacks self.on_batch_start() @@ -699,6 +702,15 @@ def run_training_batch(self, batch, batch_idx): if response == -1: return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) + with self.profiler.profile('on_train_batch_start'): + # callbacks + self.on_train_batch_start() + # hooks + if self.is_function_implemented('on_train_batch_start'): + response = self.get_model().on_train_batch_start(batch) + if response == -1: + return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) + splits = [batch] if self.truncated_bptt_steps is not None: model_ref = self.get_model() @@ -785,6 +797,13 @@ def run_training_batch(self, batch, batch_idx): if self.is_function_implemented('on_batch_end'): self.get_model().on_batch_end() + with self.profiler.profile('on_train_batch_end'): + # callbacks + self.on_train_batch_end() + # model hooks + if self.is_function_implemented('on_train_batch_end'): + self.get_model().on_train_batch_end() + # collapse all metrics into one dict batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()} diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index d10965524394b2..83de82c71de679 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -28,6 +28,8 @@ def __init__(self): self.on_epoch_end_called = False self.on_batch_start_called = False self.on_batch_end_called = False + self.on_train_batch_start_called = False + self.on_train_batch_end_called = False self.on_validation_batch_start_called = False 
self.on_validation_batch_end_called = False self.on_test_batch_start_called = False @@ -87,6 +89,14 @@ def on_batch_end(self, trainer, pl_module): _check_args(trainer, pl_module) self.on_batch_end_called = True + def on_train_batch_start(self, trainer, pl_module): + _check_args(trainer, pl_module) + self.on_train_batch_start_called = True + + def on_train_batch_end(self, trainer, pl_module): + _check_args(trainer, pl_module) + self.on_train_batch_end_called = True + def on_validation_batch_start(self, trainer, pl_module): _check_args(trainer, pl_module) self.on_validation_batch_start_called = True @@ -150,6 +160,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_epoch_start_called assert not test_callback.on_batch_start_called assert not test_callback.on_batch_end_called + assert not test_callback.on_train_batch_start_called + assert not test_callback.on_train_batch_end_called assert not test_callback.on_validation_batch_start_called assert not test_callback.on_validation_batch_end_called assert not test_callback.on_test_batch_start_called @@ -177,6 +189,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_epoch_start_called assert not test_callback.on_batch_start_called assert not test_callback.on_batch_end_called + assert not test_callback.on_train_batch_start_called + assert not test_callback.on_train_batch_end_called assert not test_callback.on_validation_batch_start_called assert not test_callback.on_validation_batch_end_called assert not test_callback.on_test_batch_start_called @@ -202,6 +216,8 @@ def on_test_end(self, trainer, pl_module): assert test_callback.on_epoch_start_called assert test_callback.on_batch_start_called assert test_callback.on_batch_end_called + assert test_callback.on_train_batch_start_called + assert test_callback.on_train_batch_end_called assert test_callback.on_validation_batch_start_called assert test_callback.on_validation_batch_end_called assert test_callback.on_train_start_called diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index 23743dc5dcb2cc..779077c437585c 100644 --- a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -153,12 +153,12 @@ class CurrentProgressBar(ProgressBar): val_batches_seen = 0 test_batches_seen = 0 - def on_batch_start(self, trainer, pl_module): - super().on_batch_start(trainer, pl_module) + def on_train_batch_start(self, trainer, pl_module): + super().on_train_batch_start(trainer, pl_module) assert self.train_batch_idx == trainer.batch_idx - def on_batch_end(self, trainer, pl_module): - super().on_batch_end(trainer, pl_module) + def on_train_batch_end(self, trainer, pl_module): + super().on_train_batch_end(trainer, pl_module) assert self.train_batch_idx == trainer.batch_idx + 1 if not self.is_disabled and self.train_batch_idx % self.refresh_rate == 0: assert self.main_progress_bar.n == self.train_batch_idx diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index ec66afb71ca22b..305f7f3d69150e 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -50,17 +50,17 @@ def test_can_prepare_data(tmpdir): # is_overridden prepare data = True # has been called - # False + # False dm._has_prepared_data = True assert not trainer.can_prepare_data() # has not been called - # True + # True dm._has_prepared_data = False assert trainer.can_prepare_data() # is_overridden prepare data = False - # True + # True dm.prepare_data = None assert trainer.can_prepare_data() diff 
--git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 3afa1dd11c56c1..5bd81d7116948d 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -214,7 +214,7 @@ class RankZeroLoggerCheck(Callback): # this class has to be defined outside the test function, otherwise we get pickle error # due to the way ddp process is launched - def on_batch_start(self, trainer, pl_module): + def on_train_batch_start(self, trainer, pl_module): is_dummy = isinstance(trainer.logger.experiment, DummyExperiment) if trainer.is_global_zero: assert not is_dummy diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c7652ebecf3f9a..3dbb7b7c079d64 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -377,7 +377,7 @@ def increment_on_load_checkpoint(self, _): # Bind methods to keep track of epoch numbers, batch numbers it has seen # as well as number of times it has called on_load_checkpoint() model.on_epoch_end = types.MethodType(increment_epoch, model) - model.on_batch_start = types.MethodType(increment_batch, model) + model.on_train_batch_start = types.MethodType(increment_batch, model) model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model) return model @@ -691,7 +691,7 @@ class InterruptCallback(Callback): def __init__(self): super().__init__() - def on_batch_start(self, trainer, pl_module): + def on_train_batch_start(self, trainer, pl_module): raise KeyboardInterrupt class HandleInterruptCallback(Callback): diff --git a/tests/utilities/test_dtype_device_mixin.py b/tests/utilities/test_dtype_device_mixin.py index f755cf5c634ed5..08f808bda9ceb0 100644 --- a/tests/utilities/test_dtype_device_mixin.py +++ b/tests/utilities/test_dtype_device_mixin.py @@ -27,7 +27,7 @@ def __init__(self, *args, **kwargs): class DeviceAssertCallback(Callback): - def on_batch_start(self, trainer, model): + def on_train_batch_start(self, trainer, model): rank = trainer.local_rank assert isinstance(model, TopModule) # index = None also means first device From fe29c53ab5eb16758ccc448716e1c365da5c1beb Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 6 Aug 2020 02:42:09 +0200 Subject: [PATCH 13/39] add ddp sync for logging in result step (#2822) * add ddp sync for logging in result step * pep8 * pep8 * make ddp tests run also on cpu (except windowws) * create class instance in ddp test * revert automated formatting * pep8 --- pytorch_lightning/core/step_result.py | 33 +++++++++++++++++++++--- tests/core/test_results.py | 37 +++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 tests/core/test_results.py diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 253ccedabc5d79..172930fd4ad9a8 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -1,7 +1,9 @@ +import numbers from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any from torch import Tensor import torch from copy import copy +from pytorch_lightning.metrics.converters import _sync_ddp_if_available class Result(Dict): @@ -89,11 +91,18 @@ def log( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): value = value.detach() + # sync across 
ddp + if sync_ddp and isinstance(value, (torch.Tensor, numbers.Number)): + value = _sync_ddp_if_available(value, group=sync_ddp_group, reduce_op=sync_ddp_op) + if 'meta' not in self: self.__setitem__('meta', {}) @@ -338,6 +347,9 @@ def log( on_epoch: bool = False, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a key, value @@ -369,7 +381,8 @@ def log( reduce_fx: Torch.mean by default enable_graph: if True, will not auto detach the graph """ - super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) def log_dict( self, @@ -380,6 +393,9 @@ def log_dict( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a dictionary of values at once @@ -399,7 +415,8 @@ def log_dict( enable_graph: """ for k, v in dictionary.items(): - self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) class EvalResult(Result): @@ -446,6 +463,9 @@ def log( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a key, value @@ -476,7 +496,8 @@ def log( reduce_fx: Torch.mean by default enable_graph: if True, will not auto detach the graph : """ - super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) def log_dict( self, @@ -487,6 +508,9 @@ def log_dict( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a dictionary of values at once @@ -506,7 +530,8 @@ def log_dict( enable_graph: """ for k, v in dictionary.items(): - self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) def get_callback_metrics(self) -> dict: result = { diff --git a/tests/core/test_results.py b/tests/core/test_results.py new file mode 100644 index 00000000000000..743a6d89153436 --- /dev/null +++ b/tests/core/test_results.py @@ -0,0 +1,37 @@ +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +import tests.base.develop_utils as tutils +import sys + + +def _setup_ddp(rank, worldsize): + import os + + os.environ["MASTER_ADDR"] = "localhost" + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=worldsize) + + +def _ddp_test_fn(rank, worldsize, result_cls: Result): + _setup_ddp(rank, worldsize) + tensor = torch.tensor([1.0]) + + res = result_cls() + res.log("test_tensor", tensor, sync_ddp=True,
sync_ddp_op=torch.distributed.ReduceOp.SUM) + + assert res["test_tensor"].item() == dist.get_world_size(), "Result-Log does not work properly with DDP and Tensors" + + +@pytest.mark.parametrize("result_cls", [Result, TrainResult, EvalResult]) +@pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows") +def test_result_reduce_ddp(result_cls): + """Make sure result logging works with DDP""" + tutils.reset_seed() + tutils.set_random_master_port() + + worldsize = 2 + mp.spawn(_ddp_test_fn, args=(worldsize, result_cls), nprocs=worldsize) From dd78be516aafe89890065ecc4c24b0303dba2712 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 20:45:11 -0400 Subject: [PATCH 14/39] Update __init__.py --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index f3bde1c7ee5738..3a1d0fa8474580 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '0.9.0rc8' +__version__ = '0.9.0rc9' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From ac4a21507105e2c29c114e93d3cc49485ef34de2 Mon Sep 17 00:00:00 2001 From: Younghun Roh <9127047+Diuven@users.noreply.github.com> Date: Thu, 6 Aug 2020 18:40:35 +0900 Subject: [PATCH 15/39] Faster Accuracy metric (#2775) * Faster classification stats * Faster accuracy metric * minor change on cls metric * Add out-of-bound class clamping * Add more tests and minor fixes * Resolve code style warning * Update for #2781 * hotfix * Update pytorch_lightning/metrics/functional/classification.py Co-authored-by: Jirka Borovec * Update about conversation * Add docstring on stat_scores_multiple_classes Co-authored-by: Younghun Roh Co-authored-by: Jirka Borovec --- .../metrics/functional/classification.py | 81 ++++++++++++++----- .../metrics/functional/test_classification.py | 16 ++-- 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/pytorch_lightning/metrics/functional/classification.py b/pytorch_lightning/metrics/functional/classification.py index 0ed308dff87aac..d12509d5885299 100644 --- a/pytorch_lightning/metrics/functional/classification.py +++ b/pytorch_lightning/metrics/functional/classification.py @@ -138,10 +138,10 @@ def stat_scores_multiple_classes( target: torch.Tensor, num_classes: Optional[int] = None, argmax_dim: int = 1, + reduction: str = 'none', ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - Calls the stat_scores function iteratively for all classes, thus - calculating the number of true postive, false postive, true negative + Calculates the number of true positive, false positive, true negative and false negative for each class Args: pred: prediction tensor target: target tensor num_classes: number of classes if known argmax_dim: if pred is a tensor of probabilities, this indicates the axis the argmax transformation will be applied over + reduction: method for reducing result values (default: none) + Available reduction methods: + + - elementwise_mean: takes the mean + - none: pass array + - sum: add elements Return: True Positive, False Positive, True Negative, False Negative, Support @@ -173,16 +179,58 @@ if pred.ndim == target.ndim + 1: pred = to_categorical(pred, argmax_dim=argmax_dim) - num_classes = get_num_classes(pred=pred, target=target, - num_classes=num_classes) + num_classes =
get_num_classes(pred=pred, target=target, num_classes=num_classes) - tps = torch.zeros((num_classes,), device=pred.device) - fps = torch.zeros((num_classes,), device=pred.device) - tns = torch.zeros((num_classes,), device=pred.device) - fns = torch.zeros((num_classes,), device=pred.device) - sups = torch.zeros((num_classes,), device=pred.device) - for c in range(num_classes): - tps[c], fps[c], tns[c], fns[c], sups[c] = stat_scores(pred=pred, target=target, class_index=c) + if pred.dtype != torch.bool: + pred.clamp_max_(max=num_classes) + if target.dtype != torch.bool: + target.clamp_max_(max=num_classes) + + possible_reductions = ('none', 'sum', 'elementwise_mean') + if reduction not in possible_reductions: + raise ValueError("reduction type %s not supported" % reduction) + + if reduction == 'none': + pred = pred.view((-1, )).long() + target = target.view((-1, )).long() + + tps = torch.zeros((num_classes + 1,), device=pred.device) + fps = torch.zeros((num_classes + 1,), device=pred.device) + tns = torch.zeros((num_classes + 1,), device=pred.device) + fns = torch.zeros((num_classes + 1,), device=pred.device) + sups = torch.zeros((num_classes + 1,), device=pred.device) + + match_true = (pred == target).float() + match_false = 1 - match_true + + tps.scatter_add_(0, pred, match_true) + fps.scatter_add_(0, pred, match_false) + fns.scatter_add_(0, target, match_false) + tns = pred.size(0) - (tps + fps + fns) + sups.scatter_add_(0, target, torch.ones_like(match_true)) + + tps = tps[:num_classes] + fps = fps[:num_classes] + tns = tns[:num_classes] + fns = fns[:num_classes] + sups = sups[:num_classes] + + elif reduction == 'sum' or reduction == 'elementwise_mean': + count_match_true = (pred == target).sum().float() + oob_tp, oob_fp, oob_tn, oob_fn, oob_sup = stat_scores(pred, target, num_classes, argmax_dim) + + tps = count_match_true - oob_tp + fps = pred.nelement() - count_match_true - oob_fp + fns = pred.nelement() - count_match_true - oob_fn + tns = pred.nelement() * (num_classes + 1) - (tps + fps + fns + oob_tn) + sups = pred.nelement() - oob_sup.float() + + if reduction == 'elementwise_mean': + tps /= num_classes + fps /= num_classes + fns /= num_classes + tns /= num_classes + sups /= num_classes return tps, fps, tns, fns, sups @@ -218,16 +266,13 @@ def accuracy( tensor(0.7500) """ - tps, fps, tns, fns, sups = stat_scores_multiple_classes( - pred=pred, target=target, num_classes=num_classes) - if not (target > 0).any() and num_classes is None: raise RuntimeError("cannot infer num_classes when target is all zero") - if reduction in ('elementwise_mean', 'sum'): - return reduce(sum(tps) / sum(sups), reduction=reduction) - if reduction == 'none': - return reduce(tps / sups, reduction=reduction) + tps, fps, tns, fns, sups = stat_scores_multiple_classes( + pred=pred, target=target, num_classes=num_classes, reduction=reduction) + + return tps / sups def confusion_matrix( diff --git a/tests/metrics/functional/test_classification.py b/tests/metrics/functional/test_classification.py index c9e1f0892f6e7f..bc2c5cb34354af 100644 --- a/tests/metrics/functional/test_classification.py +++ b/tests/metrics/functional/test_classification.py @@ -121,15 +121,19 @@ def test_stat_scores(pred, target, expected_tp, expected_fp, expected_tn, expect assert sup.item() == expected_support -@pytest.mark.parametrize(['pred', 'target', 'expected_tp', 'expected_fp', +@pytest.mark.parametrize(['pred', 'target', 'reduction', 'expected_tp', 'expected_fp', 'expected_tn', 'expected_fn', 'expected_support'], [ - 
pytest.param(torch.tensor([0., 2., 4., 4.]), torch.tensor([0., 4., 3., 4.]), + pytest.param(torch.tensor([0., 2., 4., 4.]), torch.tensor([0., 4., 3., 4.]), 'none', + [1, 0, 0, 0, 1], [0, 0, 1, 0, 1], [3, 4, 3, 3, 1], [0, 0, 0, 1, 1], [1, 0, 0, 1, 2]), + pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), 'none', [1, 0, 0, 0, 1], [0, 0, 1, 0, 1], [3, 4, 3, 3, 1], [0, 0, 0, 1, 1], [1, 0, 0, 1, 2]), - pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), - [1, 0, 0, 0, 1], [0, 0, 1, 0, 1], [3, 4, 3, 3, 1], [0, 0, 0, 1, 1], [1, 0, 0, 1, 2]) + pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), 'sum', + torch.tensor(2), torch.tensor(2), torch.tensor(14), torch.tensor(2), torch.tensor(4)), + pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), 'elementwise_mean', + torch.tensor(0.4), torch.tensor(0.4), torch.tensor(2.8), torch.tensor(0.4), torch.tensor(0.8)) ]) -def test_stat_scores_multiclass(pred, target, expected_tp, expected_fp, expected_tn, expected_fn, expected_support): - tp, fp, tn, fn, sup = stat_scores_multiple_classes(pred, target) +def test_stat_scores_multiclass(pred, target, reduction, expected_tp, expected_fp, expected_tn, expected_fn, expected_support): + tp, fp, tn, fn, sup = stat_scores_multiple_classes(pred, target, reduction=reduction) assert torch.allclose(torch.tensor(expected_tp).to(tp), tp) assert torch.allclose(torch.tensor(expected_fp).to(fp), fp) From 767c44950c37cb67c5864ab2b92d22da74f6608b Mon Sep 17 00:00:00 2001 From: xmotli02 <9034262+xmotli02@users.noreply.github.com> Date: Thu, 6 Aug 2020 12:08:25 +0200 Subject: [PATCH 16/39] Added basic file logger (#2721) * Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * csv * Apply suggestions from code review * tests * tests * tests * miss * docs Co-authored-by: xmotli02 Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 + docs/source/loggers.rst | 6 + pytorch_lightning/core/saving.py | 2 +- pytorch_lightning/loggers/__init__.py | 3 + pytorch_lightning/loggers/csv_logs.py | 204 ++++++++++++++++++++++++++ tests/loggers/test_all.py | 7 + tests/loggers/test_csv.py | 97 ++++++++++++ 7 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 pytorch_lightning/loggers/csv_logs.py create mode 100644 tests/loggers/test_csv.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e800c28964ff8..bf8d002bce0e8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added SyncBN for DDP ([#2801](https://github.com/PyTorchLightning/pytorch-lightning/pull/2801)) +- Added basic `CSVLogger` ([#2721](https://github.com/PyTorchLightning/pytorch-lightning/pull/2721)) + - Added SSIM metrics ([#2671](https://github.com/PyTorchLightning/pytorch-lightning/pull/2671)) - Added BLEU metrics ([#2535](https://github.com/PyTorchLightning/pytorch-lightning/pull/2535)) diff --git a/docs/source/loggers.rst b/docs/source/loggers.rst index 1877e9f3eff5a8..e04ba1af5ca1cc 100644 --- a/docs/source/loggers.rst +++ b/docs/source/loggers.rst @@ -339,4 +339,10 @@ Test-tube ^^^^^^^^^ .. autoclass:: pytorch_lightning.loggers.test_tube.TestTubeLogger + :noindex: + +CSVLogger +^^^^^^^^^ + +.. 
autoclass:: pytorch_lightning.loggers.csv_logs.CSVLogger :noindex: \ No newline at end of file diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index 5e3ef1d97236d7..37c63de1804ab1 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -313,7 +313,7 @@ def load_hparams_from_yaml(config_yaml: str) -> Dict[str, Any]: return {} with open(config_yaml) as fp: - tags = yaml.load(fp, Loader=yaml.SafeLoader) + tags = yaml.load(fp) return tags diff --git a/pytorch_lightning/loggers/__init__.py b/pytorch_lightning/loggers/__init__.py index daa2b99bb80c6f..5f2f3044d0a65d 100644 --- a/pytorch_lightning/loggers/__init__.py +++ b/pytorch_lightning/loggers/__init__.py @@ -2,11 +2,14 @@ from pytorch_lightning.loggers.base import LightningLoggerBase, LoggerCollection from pytorch_lightning.loggers.tensorboard import TensorBoardLogger +from pytorch_lightning.loggers.csv_logs import CSVLogger + __all__ = [ 'LightningLoggerBase', 'LoggerCollection', 'TensorBoardLogger', + 'CSVLogger', ] try: diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py new file mode 100644 index 00000000000000..1e395abadb2937 --- /dev/null +++ b/pytorch_lightning/loggers/csv_logs.py @@ -0,0 +1,204 @@ +""" +CSV logger +---------- + +CSV logger for basic experiment logging that does not require opening ports + +""" +import io +import os +import csv +import torch +from argparse import Namespace +from typing import Optional, Dict, Any, Union + +from pytorch_lightning import _logger as log +from pytorch_lightning.core.saving import save_hparams_to_yaml +from pytorch_lightning.loggers.base import LightningLoggerBase +from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_only + + +class ExperimentWriter(object): + r""" + Experiment writer for CSVLogger. + + Currently supports logging hyperparameters and metrics in YAML and CSV + format, respectively. + + Args: + log_dir: Directory for the experiment logs + """ + + NAME_HPARAMS_FILE = 'hparams.yaml' + NAME_METRICS_FILE = 'metrics.csv' + + def __init__(self, log_dir: str) -> None: + self.hparams = {} + self.metrics = [] + + self.log_dir = log_dir + if os.path.exists(self.log_dir): + rank_zero_warn( + f"Experiment logs directory {self.log_dir} exists and is not empty." + " Previous log files in this directory will be deleted when the new ones are saved!"
+ ) + os.makedirs(self.log_dir, exist_ok=True) + + self.metrics_file_path = os.path.join(self.log_dir, self.NAME_METRICS_FILE) + + def log_hparams(self, params: Dict[str, Any]) -> None: + """Record hparams""" + self.hparams.update(params) + + def log_metrics(self, metrics_dict: Dict[str, float], step: Optional[int] = None) -> None: + """Record metrics""" + def _handle_value(value): + if isinstance(value, torch.Tensor): + return value.item() + return value + + if step is None: + step = len(self.metrics) + + metrics = {k: _handle_value(v) for k, v in metrics_dict.items()} + metrics['step'] = step + self.metrics.append(metrics) + + def save(self) -> None: + """Save recorded hparams and metrics into files""" + hparams_file = os.path.join(self.log_dir, self.NAME_HPARAMS_FILE) + save_hparams_to_yaml(hparams_file, self.hparams) + + if not self.metrics: + return + + last_m = {} + for m in self.metrics: + last_m.update(m) + metrics_keys = list(last_m.keys()) + + with io.open(self.metrics_file_path, 'w', newline='') as f: + self.writer = csv.DictWriter(f, fieldnames=metrics_keys) + self.writer.writeheader() + self.writer.writerows(self.metrics) + + +class CSVLogger(LightningLoggerBase): + r""" + Log to local file system in yaml and CSV format. Logs are saved to + ``os.path.join(save_dir, name, version)``. + + Example: + >>> from pytorch_lightning import Trainer + >>> from pytorch_lightning.loggers import CSVLogger + >>> logger = CSVLogger("logs", name="my_exp_name") + >>> trainer = Trainer(logger=logger) + + Args: + save_dir: Save directory + name: Experiment name. Defaults to ``'default'``. + version: Experiment version. If version is not specified the logger inspects the save + directory for existing versions, then automatically assigns the next available version. + """ + + def __init__(self, + save_dir: str, + name: Optional[str] = "default", + version: Optional[Union[int, str]] = None): + + super().__init__() + self._save_dir = save_dir + self._name = name or '' + self._version = version + self._experiment = None + + @property + def root_dir(self) -> str: + """ + Parent directory for all checkpoint subdirectories. + If the experiment name parameter is ``None`` or the empty string, no experiment subdirectory is used + and the checkpoint will be saved in "save_dir/version_dir" + """ + if not self.name: + return self.save_dir + return os.path.join(self.save_dir, self.name) + + @property + def log_dir(self) -> str: + """ + The log directory for this run. By default, it is named + ``'version_${self.version}'`` but it can be overridden by passing a string value + for the constructor's version parameter instead of ``None`` or an int. + """ + # create a pseudo standard path ala test-tube + version = self.version if isinstance(self.version, str) else f"version_{self.version}" + log_dir = os.path.join(self.root_dir, version) + return log_dir + + @property + def save_dir(self) -> Optional[str]: + return self._save_dir + + @property + def experiment(self) -> ExperimentWriter: + r""" + + Actual ExperimentWriter object. To use ExperimentWriter features in your + :class:`~pytorch_lightning.core.lightning.LightningModule` do the following. 
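+    The writer exposes ``log_hparams``, ``log_metrics`` and ``save``.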
+ + Example:: + + self.logger.experiment.some_experiment_writer_function() + + """ + if self._experiment: + return self._experiment + + os.makedirs(self.root_dir, exist_ok=True) + self._experiment = ExperimentWriter(log_dir=self.log_dir) + return self._experiment + + @rank_zero_only + def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: + params = self._convert_params(params) + self.experiment.log_hparams(params) + + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + self.experiment.log_metrics(metrics, step) + + @rank_zero_only + def save(self) -> None: + super().save() + self.experiment.save() + + @rank_zero_only + def finalize(self, status: str) -> None: + self.save() + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + if self._version is None: + self._version = self._get_next_version() + return self._version + + def _get_next_version(self): + root_dir = os.path.join(self._save_dir, self.name) + + if not os.path.isdir(root_dir): + log.warning('Missing logger folder: %s', root_dir) + return 0 + + existing_versions = [] + for d in os.listdir(root_dir): + if os.path.isdir(os.path.join(root_dir, d)) and d.startswith("version_"): + existing_versions.append(int(d.split("_")[1])) + + if len(existing_versions) == 0: + return 0 + + return max(existing_versions) + 1 diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 5bd81d7116948d..7978aa8e41acec 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -5,11 +5,13 @@ import platform from unittest import mock +import cloudpickle import pytest import tests.base.develop_utils as tutils from pytorch_lightning import Trainer, Callback from pytorch_lightning.loggers import ( + CSVLogger, TensorBoardLogger, MLFlowLogger, NeptuneLogger, @@ -34,6 +36,7 @@ def _get_logger_args(logger_class, save_dir): @pytest.mark.parametrize("logger_class", [ TensorBoardLogger, + CSVLogger, CometLogger, MLFlowLogger, NeptuneLogger, @@ -85,6 +88,7 @@ def log_metrics(self, metrics, step): @pytest.mark.parametrize("logger_class", [ + CSVLogger, TensorBoardLogger, CometLogger, MLFlowLogger, @@ -148,6 +152,7 @@ def name(self): @pytest.mark.parametrize("logger_class", [ TensorBoardLogger, + CSVLogger, CometLogger, MLFlowLogger, NeptuneLogger, @@ -170,6 +175,7 @@ def test_loggers_pickle(tmpdir, monkeypatch, logger_class): # test pickling loggers pickle.dumps(logger) + cloudpickle.dumps(logger) trainer = Trainer( max_epochs=1, @@ -226,6 +232,7 @@ def on_train_batch_start(self, trainer, pl_module): @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.parametrize("logger_class", [ TensorBoardLogger, + # CSVLogger, # todo CometLogger, MLFlowLogger, NeptuneLogger, diff --git a/tests/loggers/test_csv.py b/tests/loggers/test_csv.py new file mode 100644 index 00000000000000..3bc8330075e6a1 --- /dev/null +++ b/tests/loggers/test_csv.py @@ -0,0 +1,97 @@ +from argparse import Namespace + +import pytest +import torch +import os + +from pytorch_lightning.core.saving import load_hparams_from_yaml +from pytorch_lightning.loggers import CSVLogger +from pytorch_lightning.loggers.csv_logs import ExperimentWriter + + +def test_file_logger_automatic_versioning(tmpdir): + """Verify that automatic versioning works""" + + root_dir = tmpdir.mkdir("exp") + root_dir.mkdir("version_0") + root_dir.mkdir("version_1") + + logger = CSVLogger(save_dir=tmpdir, name="exp") + + 
assert logger.version == 2 + + +def test_file_logger_manual_versioning(tmpdir): + """Verify that manual versioning works""" + + root_dir = tmpdir.mkdir("exp") + root_dir.mkdir("version_0") + root_dir.mkdir("version_1") + root_dir.mkdir("version_2") + + logger = CSVLogger(save_dir=tmpdir, name="exp", version=1) + + assert logger.version == 1 + + +def test_file_logger_named_version(tmpdir): + """Verify that manual versioning works for string versions, e.g. '2020-02-05-162402' """ + + exp_name = "exp" + tmpdir.mkdir(exp_name) + expected_version = "2020-02-05-162402" + + logger = CSVLogger(save_dir=tmpdir, name=exp_name, version=expected_version) + logger.log_hyperparams({"a": 1, "b": 2}) + logger.save() + assert logger.version == expected_version + assert os.listdir(tmpdir / exp_name) == [expected_version] + assert os.listdir(tmpdir / exp_name / expected_version) + + +@pytest.mark.parametrize("name", ['', None]) +def test_file_logger_no_name(tmpdir, name): + """Verify that None or empty name works""" + logger = CSVLogger(save_dir=tmpdir, name=name) + logger.save() + assert logger.root_dir == tmpdir + assert os.listdir(tmpdir / 'version_0') + + +@pytest.mark.parametrize("step_idx", [10, None]) +def test_file_logger_log_metrics(tmpdir, step_idx): + logger = CSVLogger(tmpdir) + metrics = { + "float": 0.3, + "int": 1, + "FloatTensor": torch.tensor(0.1), + "IntTensor": torch.tensor(1) + } + logger.log_metrics(metrics, step_idx) + logger.save() + + path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE) + with open(path_csv, 'r') as fp: + lines = fp.readlines() + assert len(lines) == 2 + assert all([n in lines[0] for n in metrics]) + + +def test_file_logger_log_hyperparams(tmpdir): + logger = CSVLogger(tmpdir) + hparams = { + "float": 0.3, + "int": 1, + "string": "abc", + "bool": True, + "dict": {'a': {'b': 'c'}}, + "list": [1, 2, 3], + "namespace": Namespace(foo=Namespace(bar='buzz')), + "layer": torch.nn.BatchNorm1d + } + logger.log_hyperparams(hparams) + logger.save() + + path_yaml = os.path.join(logger.log_dir, ExperimentWriter.NAME_HPARAMS_FILE) + params = load_hparams_from_yaml(path_yaml) + assert all([n in params for n in hparams]) From 9b997c8616d5e94446fe48eee20daa3b9f93b424 Mon Sep 17 00:00:00 2001 From: s-rog <55400948+s-rog@users.noreply.github.com> Date: Thu, 6 Aug 2020 19:11:43 +0800 Subject: [PATCH 17/39] add test for none checkpoint in ddp_spawn (#2845) * add test for none checkpoint in ddp_spawn * fix code style * make sure checkpoint_callback is none * Fix tests Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- tests/trainer/test_trainer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 3dbb7b7c079d64..d6641c2f7ab249 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -988,3 +988,17 @@ def setup(self, stage): trainer.test(ckpt_path=None) assert trainer.stage == 'test' assert trainer.get_model().stage == 'test' + + +def test_trainer_ddp_spawn_none_checkpoint(tmpdir): + model = EvalModelTemplate() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + checkpoint_callback=None, + distributed_backend="ddp_spawn" + ) + assert trainer.checkpoint_callback is None + result = trainer.fit(model) + assert trainer.checkpoint_callback is None + assert result == 1 From 9ab071588bbe0e24441ae0f07a271bded0e4d9c3 Mon Sep 17 00:00:00 2001 From: Nathan Raw Date: Thu, 6 Aug 2020 05:12:47 -0600 Subject: [PATCH 18/39] Setup 
extras (#2831) * :art: use package extras * :art: get extras from reqs * :art: . * :pencil: docs * :art: . --- .github/CONTRIBUTING.md | 5 ++--- setup.py | 26 +++++++++++++++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index d39b0bd112c77a..9e1faafc63ee3f 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -137,7 +137,7 @@ formatting errors. In certain cases, a missing blank line or a wrong indent can Run these commands ```bash -pip install -r requirements/docs.txt +pip install ".[docs]" cd docs make html ``` @@ -159,8 +159,7 @@ Testing your work locally will help you speed up the process since it allows you To setup a local development environment, install both local and test dependencies: ```bash -python -m pip install -r requirements/devel.txt -python -m pip install -r requirements/examples.txt +python -m pip install ".[dev, examples]" python -m pip install pre-commit ``` diff --git a/setup.py b/setup.py index 0246b3cd5d3d9f..ded68282faa43b 100755 --- a/setup.py +++ b/setup.py @@ -12,21 +12,23 @@ # https://packaging.python.org/guides/single-sourcing-package-version/ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ - PATH_ROOT = os.path.dirname(__file__) builtins.__LIGHTNING_SETUP__ = True import pytorch_lightning # noqa: E402 -def load_requirements(path_dir=PATH_ROOT, comment_char='#'): - with open(os.path.join(path_dir, 'requirements', 'base.txt'), 'r') as file: +def load_requirements(path_dir=PATH_ROOT, file_name='base.txt', comment_char='#'): + with open(os.path.join(path_dir, 'requirements', file_name), 'r') as file: lines = [ln.strip() for ln in file.readlines()] reqs = [] for ln in lines: # filer all comments if comment_char in ln: - ln = ln[:ln.index(comment_char)] + ln = ln[:ln.index(comment_char)].strip() + # Make slight syntax alteration to git dependency for PL's sphinx theme + if ln.startswith('git') and file_name == 'docs.txt': + ln = f'pt_lightning_sphinx_theme @ {ln}#egg=pt-lightning-sphinx-theme' if ln: # if requirement is not empty reqs.append(ln) return reqs @@ -43,6 +45,19 @@ def load_long_description(): return text +# https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras +# Define package extras. These are only installed if you specify them. +# From remote, use like `pip install pytorch-lightning[dev, docs]` +# From local copy of repo, use like `pip install ".[dev, docs]"` +extras = { + 'docs': load_requirements(file_name='docs.txt'), + 'examples': load_requirements(file_name='examples.txt'), + 'extra': load_requirements(file_name='extra.txt'), + 'test': load_requirements(file_name='test.txt') +} +extras['dev'] = extras['extra'] + extras['test'] +extras['all'] = extras['dev'] + extras['examples'] + extras['docs'] + # https://packaging.python.org/discussions/install-requires-vs-requirements / # keep the meta-data here for simplicity in reading this file... it's not obvious # what happens and to non-engineers they won't know to look in init ... 
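For context on the hunk above: `extras['dev']` resolves to the union of `extra` and `test`, and `extras['all']` adds `examples` and `docs` on top, so `pip install "pytorch-lightning[dev]"` installs exactly the requirement files named in the mapping. A minimal, self-contained sketch of the comment-stripping that `load_requirements` performs, using invented requirement lines in place of a real `requirements/*.txt` file:

```python
# Stand-alone sketch of load_requirements()'s parsing; the sample
# requirement lines below are hypothetical, not from the repository.
lines = [
    'torch>=1.3  # core dependency',
    '# a full-line comment',
    'future>=0.17.1',
    '',
]

reqs = []
for ln in lines:
    if '#' in ln:
        # cut the line at the comment character and trim whitespace
        ln = ln[:ln.index('#')].strip()
    if ln:  # keep only lines that still carry a requirement
        reqs.append(ln)

assert reqs == ['torch>=1.3', 'future>=0.17.1']
```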
@@ -67,7 +82,8 @@ def load_long_description(): keywords=['deep learning', 'pytorch', 'AI'], python_requires='>=3.6', setup_requires=[], - install_requires=load_requirements(PATH_ROOT), + install_requires=load_requirements(), + extras_require=extras, project_urls={ "Bug Tracker": "https://github.com/PyTorchLightning/pytorch-lightning/issues", From ed3ee982b33db395a1fb41865d8ca3be39379b7e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 6 Aug 2020 16:58:51 +0200 Subject: [PATCH 19/39] clean tests imports (#2834) --- tests/base/datasets.py | 2 +- tests/base/deterministic_model.py | 3 +-- tests/base/develop_utils.py | 2 +- tests/base/model_train_steps.py | 3 ++- tests/callbacks/test_early_stopping.py | 2 +- tests/core/test_datamodules.py | 2 +- tests/loggers/test_tensorboard.py | 1 - tests/metrics/functional/test_regression.py | 2 +- tests/metrics/test_converters.py | 3 ++- tests/metrics/test_regression.py | 1 - tests/models/test_amp.py | 4 ++-- tests/models/test_gpu.py | 3 ++- tests/models/test_grad_norm.py | 3 ++- tests/models/test_horovod.py | 2 -- tests/models/{test_onnx_save.py => test_onnx.py} | 3 ++- tests/trainer/test_config_validator.py | 2 +- tests/trainer/test_trainer_steps_result_return.py | 8 +++++--- tests/trainer/test_trainer_steps_scalar_return.py | 3 ++- tests/trainer/test_validation_steps_result_return.py | 7 ++++--- 19 files changed, 30 insertions(+), 26 deletions(-) rename tests/models/{test_onnx_save.py => test_onnx.py} (99%) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 27e614eee68cfc..5bc6048a43c65f 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -1,8 +1,8 @@ import logging import os import random -import urllib.request import time +import urllib.request from typing import Tuple, Optional, Sequence import torch diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 75f5cad3b39cf9..676720e789e89a 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -1,9 +1,8 @@ -import numpy as np import torch from torch import nn from torch.utils.data import Dataset, DataLoader -from pytorch_lightning import TrainResult, EvalResult +from pytorch_lightning import TrainResult, EvalResult from pytorch_lightning.core.lightning import LightningModule diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index ada745951494cc..37fde1d8723c08 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -1,3 +1,4 @@ +import functools import os import numpy as np @@ -8,7 +9,6 @@ from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger from tests import TEMP_PATH, RANDOM_PORTS, RANDOM_SEEDS from tests.base.model_template import EvalModelTemplate -import functools def assert_speed_parity_relative(pl_times, pt_times, max_diff: float = 0.1): diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 189e496564da1d..c361c3692ad17c 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -1,10 +1,11 @@ import math from abc import ABC from collections import OrderedDict -from pytorch_lightning import TrainResult, EvalResult import torch +from pytorch_lightning import TrainResult, EvalResult + class TrainingStepVariations(ABC): """ diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 17ca3bb2210f33..eb5a400655b20b 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -2,8 +2,8 @@ import 
cloudpickle import pytest - import torch + from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from tests.base import EvalModelTemplate diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 305f7f3d69150e..fd4d3c082e0be9 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -1,8 +1,8 @@ import pickle from argparse import ArgumentParser -import torch import pytest +import torch from pytorch_lightning import Trainer from tests.base import EvalModelTemplate diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index 21c58084e2df40..e5aec716a25071 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -5,7 +5,6 @@ import torch import yaml from packaging import version -from tensorboard.backend.event_processing.event_accumulator import EventAccumulator from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger diff --git a/tests/metrics/functional/test_regression.py b/tests/metrics/functional/test_regression.py index cd251c77a98fc1..628e37ec78e955 100644 --- a/tests/metrics/functional/test_regression.py +++ b/tests/metrics/functional/test_regression.py @@ -1,6 +1,6 @@ +import numpy as np import pytest import torch -import numpy as np from skimage.metrics import peak_signal_noise_ratio as ski_psnr from skimage.metrics import structural_similarity as ski_ssim diff --git a/tests/metrics/test_converters.py b/tests/metrics/test_converters.py index 60eb8cc48ec7a0..1d1412dd8171a0 100644 --- a/tests/metrics/test_converters.py +++ b/tests/metrics/test_converters.py @@ -1,6 +1,7 @@ +import sys + import numpy as np import pytest -import sys import torch import torch.distributed as dist import torch.multiprocessing as mp diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py index e5ecd51c775b90..36c408e93c4695 100644 --- a/tests/metrics/test_regression.py +++ b/tests/metrics/test_regression.py @@ -3,7 +3,6 @@ # Especially reduction and reducing across processes won't be tested here! 
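 # NOTE: the apparently unused skimage reference import is dropped just
 # below; the surviving imports already follow the usual stdlib /
 # third-party / first-party grouping this commit enforces elsewhere.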
import torch -from skimage.metrics import peak_signal_noise_ratio as ski_psnr from pytorch_lightning.metrics.regression import ( MAE, MSE, RMSE, RMSLE, PSNR, SSIM diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 5f734fb3bdcf60..7f729c0c1fa5d4 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -1,15 +1,15 @@ import os +from unittest.mock import MagicMock import pytest import torch +import wandb import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -import wandb -from unittest.mock import MagicMock @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 6752e559632cba..7497a53083612e 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -2,6 +2,7 @@ import pytest import torch +from torchtext.data import Batch, Dataset, Example, Field, LabelField import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils @@ -10,7 +11,7 @@ from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -from torchtext.data import Batch, Dataset, Example, Field, LabelField + PRETEND_N_OF_GPUS = 16 diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index dc7eee557d4484..2e0d4454500f39 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -1,6 +1,7 @@ +import os + import numpy as np import pytest -import os from pytorch_lightning import Trainer from tests.base import EvalModelTemplate diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 05aa2d7f29bc3b..f48db196c104aa 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -4,10 +4,8 @@ import shlex import subprocess import sys - from unittest.mock import patch -import numpy as np import pytest import torch diff --git a/tests/models/test_onnx_save.py b/tests/models/test_onnx.py similarity index 99% rename from tests/models/test_onnx_save.py rename to tests/models/test_onnx.py index f824f33c93bc14..d7cc7cffaec3f7 100644 --- a/tests/models/test_onnx_save.py +++ b/tests/models/test_onnx.py @@ -1,9 +1,10 @@ import os +import numpy as np import onnxruntime import pytest import torch -import numpy as np + import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer diff --git a/tests/trainer/test_config_validator.py b/tests/trainer/test_config_validator.py index 4b31c7d09dba35..8db122037b51c9 100755 --- a/tests/trainer/test_config_validator.py +++ b/tests/trainer/test_config_validator.py @@ -1,7 +1,7 @@ import pytest import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, LightningModule +from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index af1f582bdf7097..1785fea3c0afee 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -2,12 +2,14 @@ Tests to ensure that the 
training loop works with a dict """ import os + +import pytest import torch + from pytorch_lightning import Trainer -from tests.base.deterministic_model import DeterministicModel -from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +from pytorch_lightning.core.step_result import TrainResult from tests.base import EvalModelTemplate -import pytest +from tests.base.deterministic_model import DeterministicModel # test with train_step_end diff --git a/tests/trainer/test_trainer_steps_scalar_return.py b/tests/trainer/test_trainer_steps_scalar_return.py index b893b58310dc37..e5eb1e9bccf5f1 100644 --- a/tests/trainer/test_trainer_steps_scalar_return.py +++ b/tests/trainer/test_trainer_steps_scalar_return.py @@ -1,9 +1,10 @@ """ Tests to ensure that the training loop works with a scalar """ +import torch + from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel -import torch def test_training_step_scalar(tmpdir): diff --git a/tests/trainer/test_validation_steps_result_return.py b/tests/trainer/test_validation_steps_result_return.py index 118e420adbd8ca..8162f57287e71e 100644 --- a/tests/trainer/test_validation_steps_result_return.py +++ b/tests/trainer/test_validation_steps_result_return.py @@ -2,12 +2,13 @@ Tests to ensure that the training loop works with a dict """ import os + +import pytest import torch + from pytorch_lightning import Trainer -from tests.base.deterministic_model import DeterministicModel -from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult from tests.base import EvalModelTemplate -import pytest +from tests.base.deterministic_model import DeterministicModel # test with train_step_end From a829f15f8c9e2a3fdcdea3c59f54ba2879842622 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 1 Aug 2020 16:32:41 +0530 Subject: [PATCH 20/39] Support limit_mode_batches(int) for infinite dataloader --- pytorch_lightning/trainer/data_loading.py | 39 +++++++++++------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 09186765c6eeec..143a132f12f98a 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -212,18 +212,20 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # automatically add samplers self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True) + self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if not _has_len(self.train_dataloader): - self.num_training_batches = float('inf') + if isinstance(self.limit_train_batches, int): + self.num_training_batches = min(self.num_training_batches, self.limit_train_batches) else: - # try getting the length - if isinstance(self.limit_train_batches, float): - self.num_training_batches = len(self.train_dataloader) + if self.num_training_batches != float('inf'): self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - else: - self.num_training_batches = min(len(self.train_dataloader), self.limit_train_batches) + elif self.limit_train_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an infinite DataLoader (e.g. 
with an IterableDataset' + f' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' + f' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') # determine when to check validation # if int passed in, val checks that often @@ -308,20 +310,17 @@ def _reset_eval_dataloader( # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') - if num_batches != float('inf'): - self._check_batch_limits(f'limit_{mode}_batches') - - # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, float): + # limit num batches either as a percent or num steps + if isinstance(limit_eval_batches, int): + num_batches = min(num_batches, limit_eval_batches) + else: + if num_batches != float('inf'): num_batches = int(num_batches * limit_eval_batches) - else: - num_batches = min(len(dataloader), limit_eval_batches) - - elif limit_eval_batches not in (0.0, 1.0): - raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0` or `1.0`.') + elif limit_eval_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an infinite DataLoader (e.g. with an IterableDataset' + f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or `int`') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) From cbddd35104d561bee7617dfc391ef0e92a7b5ebc Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 1 Aug 2020 18:10:01 +0530 Subject: [PATCH 21/39] flake8 --- pytorch_lightning/trainer/data_loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 143a132f12f98a..011ac1e4452ad7 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -224,8 +224,8 @@ def reset_train_dataloader(self, model: LightningModule) -> None: elif self.limit_train_batches not in (0.0, 1.0): raise MisconfigurationException( 'When using an infinite DataLoader (e.g. 
with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' - f' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') + ' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') # determine when to check validation # if int passed in, val checks that often From 486dbc6cd2f049f6fdf59cb0cf73d5bd27ba30a9 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 3 Aug 2020 02:05:10 +0530 Subject: [PATCH 22/39] revert and update --- pytorch_lightning/trainer/data_loading.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 011ac1e4452ad7..237a421fc83319 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -216,16 +216,16 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if isinstance(self.limit_train_batches, int): - self.num_training_batches = min(self.num_training_batches, self.limit_train_batches) + if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: + self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) else: if self.num_training_batches != float('inf'): self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) elif self.limit_train_batches not in (0.0, 1.0): raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - ' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' - ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') + 'When using an IterableDataset for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' + ' num_training_batches to use.') # determine when to check validation # if int passed in, val checks that often @@ -243,8 +243,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = float('inf') else: raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - ' or when DataLoader does not implement `__len__`) for `train_dataloader`,' + 'When using an IterableDataset for `train_dataloader`,' ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: @@ -311,16 +310,16 @@ def _reset_eval_dataloader( limit_eval_batches = getattr(self, f'limit_{mode}_batches') # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, int): - num_batches = min(num_batches, limit_eval_batches) + if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: + num_batches = min(num_batches, int(limit_eval_batches)) else: if num_batches != float('inf'): num_batches = int(num_batches * limit_eval_batches) elif limit_eval_batches not in (0.0, 1.0): raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or `int`') + 'When using an IterableDataset for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + f' num_{mode}_batches to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) From 7d99c320155dcabfb9dd7b9b9a97d1958edd4ff9 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 3 Aug 2020 02:10:36 +0530 Subject: [PATCH 23/39] add and update tests --- tests/trainer/test_dataloaders.py | 75 +++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1c7e21b7a72bb5..c05338ab724d19 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -256,6 +256,65 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): f'Multiple `test_dataloaders` not initiated properly, got {trainer.test_dataloaders}' +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [ + pytest.param(0.0, 0.0, 0.0), + pytest.param(1.0, 1.0, 1.0), + ] +) +def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results == 1 + assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') + assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') + + +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [pytest.param(10, 10, 10)] +) +def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. 
IterableDataset) passed with batch limit as number""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results + assert trainer.num_training_batches == limit_train_batches + assert trainer.num_val_batches[0] == limit_val_batches + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == limit_test_batches + + @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], [ @@ -266,7 +325,7 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): ] ) def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit in percent""" + """Verify num_batches for train, val & test dataloaders passed with batch limit in percent""" model = EvalModelTemplate() model.val_dataloader = model.val_dataloader__multiple_mixed_length model.test_dataloader = model.test_dataloader__multiple_mixed_length @@ -307,7 +366,7 @@ def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit as number""" + """Verify num_batches for train, val & test dataloaders passed with batch limit as number""" os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() @@ -436,7 +495,7 @@ def test_train_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -447,7 +506,7 @@ def test_val_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -458,7 +517,7 @@ def test_test_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) @@ -774,7 +833,7 @@ def test_train_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -785,7 +844,7 @@ def test_val_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -796,5 +855,5 @@ def 
test_test_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) From 515d9be084caf28811a556cc12f4a8aa6e700577 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 3 Aug 2020 02:19:47 +0530 Subject: [PATCH 24/39] pep8 --- tests/trainer/test_dataloaders.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index c05338ab724d19..2df86c2ab52438 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -263,7 +263,8 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): pytest.param(1.0, 1.0, 1.0), ] ) -def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): +def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, + limit_val_batches, limit_test_batches): """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" model = EvalModelTemplate() model.train_dataloader = model.train_dataloader__infinite From a5972f525fd7bcad15fde45560775525ca1f8369 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 4 Aug 2020 23:15:31 +0530 Subject: [PATCH 25/39] chlog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf8d002bce0e8b..d1e3e449ae6d33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader(IterableDataset) ([2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) + ### Changed - Truncated long version numbers in progress bar ([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594)) From b3efefb863fd7cabb05ceb3a48d552d6ad7bb091 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 09:37:19 +0200 Subject: [PATCH 26/39] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d1e3e449ae6d33..d0c2e39a428f34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) -- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader(IterableDataset) ([2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) ### Changed From e57d26e4c090a21d8d41998eed870a86f3c9ef79 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 5 Aug 2020 16:15:26 +0530 Subject: [PATCH 27/39] Add suggestions by @awaelchli --- pytorch_lightning/trainer/data_loading.py | 30 +++++++++++------------ tests/trainer/test_dataloaders.py | 5 +++- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 237a421fc83319..18f095446dc655 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -218,14 +218,13 @@ def reset_train_dataloader(self, model: LightningModule) -> None: if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) - else: - if self.num_training_batches != float('inf'): - self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - elif self.limit_train_batches not in (0.0, 1.0): - raise MisconfigurationException( - 'When using an IterableDataset for `limit_train_batches`,' - ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - ' num_training_batches to use.') + elif self.num_training_batches != float('inf'): + self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) + elif self.limit_train_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an IterableDataset for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' + ' num_training_batches to use.') # determine when to check validation # if int passed in, val checks that often @@ -312,14 +311,13 @@ def _reset_eval_dataloader( # limit num batches either as a percent or num steps if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: num_batches = min(num_batches, int(limit_eval_batches)) - else: - if num_batches != float('inf'): - num_batches = int(num_batches * limit_eval_batches) - elif limit_eval_batches not in (0.0, 1.0): - raise MisconfigurationException( - 'When using an IterableDataset for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - f' num_{mode}_batches to use.') + elif num_batches != float('inf'): + num_batches = int(num_batches * limit_eval_batches) + elif limit_eval_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an IterableDataset for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + f' num_{mode}_batches to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 2df86c2ab52438..1aad5047855a2e 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -290,7 +290,10 @@ def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], - [pytest.param(10, 10, 10)] + [ + pytest.param(0, 0, 0), + pytest.param(10, 10, 10), + ] ) def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit as number""" From a3c1f17fb02fcee36ddcb1dbbf3008719a6cadd9 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 5 Aug 2020 16:35:33 +0530 Subject: [PATCH 28/39] docs --- docs/source/sequences.rst | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index e24ee5bbca1cc9..301e4efbba6df8 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -49,8 +49,8 @@ Lightning can handle TBTT automatically via this flag. .. note:: If you need to modify how the batch is split, override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`. -.. note:: Using this feature requires updating your LightningModule's :meth:`pytorch_lightning.core.LightningModule.training_step` to include - a `hiddens` arg. +.. note:: Using this feature requires updating your LightningModule's + :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg. ---------- @@ -59,10 +59,13 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or to an int - (specifying the number of training batches to run before validation) when initializing the Trainer. - This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate - the validation interval when val_check_interval is less than one. +.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or an int + (specifying the number of training batches to run before validation) when initializing the Trainer. This is + because the IterableDataset does not have a __len__ and Lightning requires this to calculate the validation + interval when val_check_interval is less than one. Similarly, you can set limit_{mode}_batches to a float or + an int. If it is set to 0.0 or 0 it will set num_{mode}_batches to 0, if it is an int it will set num_{mode}_batches + to limit_{mode}_batches, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. + Here mode can be train/val/test. .. testcode:: @@ -87,3 +90,9 @@ option when using sequential data. 
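To make the new behaviour concrete, here is a minimal sketch of the infinite-loader case the note above describes; the `RandomStream` class is invented for illustration and is not part of the patch:

```python
import torch
from torch.utils.data import DataLoader, IterableDataset

from pytorch_lightning import Trainer


class RandomStream(IterableDataset):
    """An endless stream of samples: it defines no __len__,
    so Lightning cannot infer an epoch length from it."""

    def __iter__(self):
        while True:
            yield torch.randn(32)


train_loader = DataLoader(RandomStream(), batch_size=None)

# An int caps how many batches make up one training "epoch".
trainer = Trainer(limit_train_batches=100)

# limit_train_batches=0.0 (or 0) runs no training batches at all, and the
# default 1.0 iterates the stream without limit; any other float raises a
# MisconfigurationException, since there is no length to take a fraction
# of. limit_val_batches and limit_test_batches follow the same rules, and
# val_check_interval likewise accepts only 1.0 or an int here.
```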
# Set val_check_interval trainer = Trainer(val_check_interval=100) + + # Set limit_val_batches to 0.0 or 0 + trainer = Trainer(limit_val_batches=0.0) + + # Set limit_val_batches as an int + trainer = Trainer(limit_val_batches=100) From 71c4679e3fb15446a9ae446591ec6ca63af85cfb Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 15:15:41 +0200 Subject: [PATCH 29/39] Apply suggestions from code review Co-authored-by: Ethan Harris --- pytorch_lightning/trainer/data_loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 18f095446dc655..ce6a7235b2bd4c 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -220,7 +220,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) elif self.num_training_batches != float('inf'): self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - elif self.limit_train_batches not in (0.0, 1.0): + elif self.limit_train_batches != 1.0: raise MisconfigurationException( 'When using an IterableDataset for `limit_train_batches`,' ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' @@ -313,7 +313,7 @@ def _reset_eval_dataloader( num_batches = min(num_batches, int(limit_eval_batches)) elif num_batches != float('inf'): num_batches = int(num_batches * limit_eval_batches) - elif limit_eval_batches not in (0.0, 1.0): + elif limit_eval_batches != 1.0: raise MisconfigurationException( 'When using an IterableDataset for `limit_{mode}_batches`,' f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' From c7472b8a2fa568cd3f23e051bbc941e2ccb16b43 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 15:25:49 +0200 Subject: [PATCH 30/39] Apply suggestions from code review --- docs/source/sequences.rst | 8 ++++---- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/data_loading.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index 301e4efbba6df8..bb438154e39091 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -61,10 +61,10 @@ option when using sequential data. .. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or an int (specifying the number of training batches to run before validation) when initializing the Trainer. This is - because the IterableDataset does not have a __len__ and Lightning requires this to calculate the validation - interval when val_check_interval is less than one. Similarly, you can set limit_{mode}_batches to a float or - an int. If it is set to 0.0 or 0 it will set num_{mode}_batches to 0, if it is an int it will set num_{mode}_batches - to limit_{mode}_batches, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. + because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation + interval when ``val_check_interval`` is less than one. Similarly, you can set limit_{mode}_batches to a float or + an int. 
If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` + to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. Here mode can be train/val/test. .. testcode:: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index f816726ddf1e17..4189c828ed266e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1771,7 +1771,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError('input_sample and example_input_array tensors are both missing.') + raise ValueError('`input_sample` and `example_input_array` tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index ce6a7235b2bd4c..53d624d46c0d9d 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -317,7 +317,7 @@ def _reset_eval_dataloader( raise MisconfigurationException( 'When using an IterableDataset for `limit_{mode}_batches`,' f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - f' num_{mode}_batches to use.') + f' `num_{mode}_batches` to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) From 9a13fff79ad74f846b9b0439992eb5df31622dbc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 17:04:14 +0200 Subject: [PATCH 31/39] fix --- tests/models/test_onnx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index d7cc7cffaec3f7..278465941a043a 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -85,7 +85,7 @@ def test_error_if_no_input(tmpdir): model = EvalModelTemplate() model.example_input_array = None file_path = os.path.join(tmpdir, "model.onxx") - with pytest.raises(ValueError, match=r'input_sample and example_input_array tensors are both missing'): + with pytest.raises(ValueError, match=r'`input_sample` and `example_input_array` tensors are both missing'): model.to_onnx(file_path) From 5b7c000e4525533e4a4347209c2d0bbadb0767f6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 17:06:49 +0200 Subject: [PATCH 32/39] max --- pytorch_lightning/trainer/data_loading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 53d624d46c0d9d..c3567624b9821b 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -219,6 +219,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) elif self.num_training_batches != float('inf'): + self.num_training_batches = min(1.0, self.num_training_batches) self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) elif self.limit_train_batches != 1.0: raise MisconfigurationException( @@ -312,6 +313,7 @@ def _reset_eval_dataloader( if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: num_batches = min(num_batches, 
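                 # (limit_eval_batches == 0.0 also takes this branch,
                 #  and int(0.0) == 0 disables evaluation entirely)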
int(limit_eval_batches)) elif num_batches != float('inf'): + num_batches = min(1.0, num_batches) num_batches = int(num_batches * limit_eval_batches) elif limit_eval_batches != 1.0: raise MisconfigurationException( From 71a773954037d4f596f916ae49569c4133cf1076 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 5 Aug 2020 22:10:28 +0530 Subject: [PATCH 33/39] check --- docs/source/sequences.rst | 4 ++-- pytorch_lightning/trainer/data_loading.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index bb438154e39091..b9a8f2ee642aad 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -59,10 +59,10 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or an int +.. note:: When using an IterableDataset you must set the ``val_check_interval`` to 1.0 (the default) or an int (specifying the number of training batches to run before validation) when initializing the Trainer. This is because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation - interval when ``val_check_interval`` is less than one. Similarly, you can set limit_{mode}_batches to a float or + interval when ``val_check_interval`` is less than one. Similarly, you can set ``limit_{mode}_batches`` to a float or an int. If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. Here mode can be train/val/test. diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index c3567624b9821b..4eec847580636f 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -219,13 +219,12 @@ def reset_train_dataloader(self, model: LightningModule) -> None: if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) elif self.num_training_batches != float('inf'): - self.num_training_batches = min(1.0, self.num_training_batches) self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) elif self.limit_train_batches != 1.0: raise MisconfigurationException( 'When using an IterableDataset for `limit_train_batches`,' ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' - ' num_training_batches to use.') + ' `num_training_batches` to use.') # determine when to check validation # if int passed in, val checks that often @@ -305,6 +304,7 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') + self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') @@ -313,7 +313,6 @@ def _reset_eval_dataloader( if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: num_batches = min(num_batches, int(limit_eval_batches)) elif num_batches != float('inf'): - num_batches = min(1.0, num_batches) num_batches = int(num_batches * limit_eval_batches) elif limit_eval_batches != 1.0: raise MisconfigurationException( From 3685e3a490196647d18658f67ed7271b9892c9ed Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 01:56:42 +0530 Subject: [PATCH 34/39] check --- pytorch_lightning/trainer/data_loading.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 4eec847580636f..edfe4b72f35b28 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -385,9 +385,6 @@ def request_dataloader(self, dataloader_fx: Callable) -> DataLoader: def determine_data_use_amount(self, overfit_batches: float) -> None: """Use less data for debugging purposes""" if overfit_batches > 0: - if isinstance(overfit_batches, float) and overfit_batches > 1: - raise ValueError('`overfit_batches` when used as a percentage must' - f' be in range 0.0 < x < 1.0 but got {overfit_batches:.3f}.') self.limit_train_batches = overfit_batches self.limit_val_batches = overfit_batches self.limit_test_batches = overfit_batches From a0dfd845b02db9b1e6ebae1cb2e901c983955036 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 02:17:27 +0530 Subject: [PATCH 35/39] check --- pytorch_lightning/trainer/data_loading.py | 19 ------------------- pytorch_lightning/trainer/trainer.py | 8 +++----- tests/trainer/test_dataloaders.py | 8 +++----- 3 files changed, 6 insertions(+), 29 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index edfe4b72f35b28..38a1118118a403 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -103,21 +103,6 @@ class TrainerDataLoadingMixin(ABC): def is_overridden(self, *args): """Warning: this is just empty shell for code implemented in other class.""" - def _check_batch_limits(self, name: str) -> None: - # TODO: verify it is still needed and deprecate it.. - value = getattr(self, name) - - # ints are fine - if isinstance(value, int): - return - - msg = f'`{name}` must lie in the range [0.0, 1.0], but got {value:.3f}. (or pass in an int)' - if name == 'val_check_interval': - msg += ' If you want to disable validation set `limit_val_batches` to 0.0 instead.' - - if not 0. 
<= value <= 1.: - raise ValueError(msg) - def _worker_check(self, dataloader: DataLoader, name: str) -> None: on_windows = platform.system() == 'Windows' @@ -214,7 +199,6 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') - self._check_batch_limits('limit_train_batches') if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) @@ -246,8 +230,6 @@ def reset_train_dataloader(self, model: LightningModule) -> None: ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: - self._check_batch_limits('val_check_interval') - self.val_check_batch = int(self.num_training_batches * self.val_check_interval) self.val_check_batch = max(1, self.val_check_batch) @@ -304,7 +286,6 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') - self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4b342328df2979..a0e89f586c6059 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -534,7 +534,6 @@ def __init__( # logging self.configure_logger(logger) self.log_save_interval = log_save_interval - self.val_check_interval = val_check_interval self.row_log_interval = row_log_interval # how much of the data to use @@ -547,9 +546,6 @@ def __init__( ) overfit_batches = overfit_pct - # convert floats to ints - self.overfit_batches = _determine_limit_batches(overfit_batches) - # TODO: remove in 0.10.0 if val_percent_check is not None: rank_zero_warn( @@ -580,6 +576,8 @@ def __init__( self.limit_test_batches = _determine_limit_batches(limit_test_batches) self.limit_val_batches = _determine_limit_batches(limit_val_batches) self.limit_train_batches = _determine_limit_batches(limit_train_batches) + self.val_check_interval = _determine_limit_batches(val_check_interval) + self.overfit_batches = _determine_limit_batches(overfit_batches) self.determine_data_use_amount(self.overfit_batches) # AMP init @@ -1437,5 +1435,5 @@ def _determine_limit_batches(batches: Union[int, float]) -> Union[int, float]: return int(batches) else: raise MisconfigurationException( - f'You have passed invalid value {batches}, it has to be in (0, 1) or nature number.' + f'You have passed invalid value {batches}, it has to be in (0, 1) or an int.' 
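+        # in practice the bounds are inclusive: 0 <= batches <= 1 passes
+        # through unchanged, and whole numbers above 1 are cast to int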
) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1aad5047855a2e..854d5770720a23 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -53,19 +53,15 @@ def test_fit_val_loader_only(tmpdir): @pytest.mark.parametrize("dataloader_options", [ - dict(val_check_interval=1.1), dict(val_check_interval=10000), ]) def test_dataloader_config_errors_runtime(tmpdir, dataloader_options): - model = EvalModelTemplate() - trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, **dataloader_options, ) - with pytest.raises(ValueError): # fit model trainer.fit(model) @@ -78,9 +74,11 @@ def test_dataloader_config_errors_runtime(tmpdir, dataloader_options): dict(limit_val_batches=1.2), dict(limit_test_batches=-0.1), dict(limit_test_batches=1.2), + dict(val_check_interval=1.1), + dict(overfit_batches=1.1), ]) def test_dataloader_config_errors_init(tmpdir, dataloader_options): - with pytest.raises(MisconfigurationException): + with pytest.raises(MisconfigurationException, match='passed invalid value'): Trainer( default_root_dir=tmpdir, max_epochs=1, From aefd79986acbb59ab4b758964b83e0a42249ccd3 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 02:25:57 +0530 Subject: [PATCH 36/39] chlog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0c2e39a428f34..2396dab38539fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) -- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2840](https://github.com/PyTorchLightning/pytorch-lightning/pull/2840)) ### Changed From ff7353e8a46c8db2dde3c2f5226d79745111a08d Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 02:32:41 +0530 Subject: [PATCH 37/39] tests --- tests/trainer/test_dataloaders.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 854d5770720a23..c4a6ad8d10aed4 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -74,8 +74,10 @@ def test_dataloader_config_errors_runtime(tmpdir, dataloader_options): dict(limit_val_batches=1.2), dict(limit_test_batches=-0.1), dict(limit_test_batches=1.2), - dict(val_check_interval=1.1), - dict(overfit_batches=1.1), + dict(val_check_interval=-0.1), + dict(val_check_interval=1.2), + dict(overfit_batches=-0.1), + dict(overfit_batches=1.2), ]) def test_dataloader_config_errors_init(tmpdir, dataloader_options): with pytest.raises(MisconfigurationException, match='passed invalid value'): From cf3935c3aca1ef2fbf623f0523d50b3374d979cf Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 22:59:21 +0530 Subject: [PATCH 38/39] update exception message --- pytorch_lightning/trainer/trainer.py | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a0e89f586c6059..ea8d7941d95f60 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -573,11 +573,11 @@ def __init__( ) limit_train_batches = train_percent_check - self.limit_test_batches = _determine_limit_batches(limit_test_batches) - self.limit_val_batches = _determine_limit_batches(limit_val_batches) - self.limit_train_batches = _determine_limit_batches(limit_train_batches) - self.val_check_interval = _determine_limit_batches(val_check_interval) - self.overfit_batches = _determine_limit_batches(overfit_batches) + self.limit_train_batches = _determine_batch_limits(limit_train_batches, 'limit_train_batches') + self.limit_val_batches = _determine_batch_limits(limit_val_batches, 'limit_val_batches') + self.limit_test_batches = _determine_batch_limits(limit_test_batches, 'limit_test_batches') + self.val_check_interval = _determine_batch_limits(val_check_interval, 'val_check_interval') + self.overfit_batches = _determine_batch_limits(overfit_batches, 'overfit_batches') self.determine_data_use_amount(self.overfit_batches) # AMP init @@ -1428,12 +1428,12 @@ def __call__(self) -> Union[List[DataLoader], DataLoader]: return self.dataloader -def _determine_limit_batches(batches: Union[int, float]) -> Union[int, float]: +def _determine_batch_limits(batches: Union[int, float], name: str) -> Union[int, float]: if 0 <= batches <= 1: return batches elif batches > 1 and batches % 1.0 == 0: return int(batches) else: raise MisconfigurationException( - f'You have passed invalid value {batches}, it has to be in (0, 1) or an int.' + f'You have passed invalid value {batches} for {name}, it has to be in [0.0, 1.0] or an int.' ) From cb7d3a736f50df9074b3f21716dbb37c7652f25d Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 6 Aug 2020 21:16:56 +0200 Subject: [PATCH 39/39] Apply suggestions from code review --- tests/trainer/test_dataloaders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index c4a6ad8d10aed4..27dc7ac89100bb 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -281,11 +281,11 @@ def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, results = trainer.fit(model) assert results == 1 - assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') - assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') + assert trainer.num_training_batches == (0 if limit_train_batches == 0.0 else float('inf')) + assert trainer.num_val_batches[0] == (0 if limit_val_batches == 0.0 else float('inf')) trainer.test(ckpt_path=None) - assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') + assert trainer.num_test_batches[0] == (0 if limit_test_batches == 0.0 else float('inf')) @pytest.mark.parametrize(