From de9c9f0864418a83f295e4c87be50e12645bd83a Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 5 Aug 2020 22:34:49 +0530 Subject: [PATCH 01/39] Support limit_mode_batches (int) for infinite dataloader (#2787) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Support limit_mode_batches(int) for infinite dataloader * flake8 * revert and update * add and update tests * pep8 * chlog * Update CHANGELOG.md Co-authored-by: Adrian Wälchli * Add suggestions by @awaelchli * docs * Apply suggestions from code review Co-authored-by: Ethan Harris * Apply suggestions from code review * fix * max * check Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> Co-authored-by: Ethan Harris Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 + docs/source/sequences.rst | 21 ++++-- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/data_loading.py | 45 ++++++----- pytorch_lightning/trainer/training_tricks.py | 2 +- tests/models/test_onnx_save.py | 2 +- tests/trainer/test_dataloaders.py | 79 ++++++++++++++++++-- 7 files changed, 112 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a56caac77a16f5..f1300140f229b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) + ### Changed - Truncated long version numbers in progress bar ([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594)) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index e24ee5bbca1cc9..b9a8f2ee642aad 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -49,8 +49,8 @@ Lightning can handle TBTT automatically via this flag. .. note:: If you need to modify how the batch is split, override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`. -.. note:: Using this feature requires updating your LightningModule's :meth:`pytorch_lightning.core.LightningModule.training_step` to include - a `hiddens` arg. +.. note:: Using this feature requires updating your LightningModule's + :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg. ---------- @@ -59,10 +59,13 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or to an int - (specifying the number of training batches to run before validation) when initializing the Trainer. - This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate - the validation interval when val_check_interval is less than one. +.. 
note:: When using an IterableDataset you must set the ``val_check_interval`` to 1.0 (the default) or an int + (specifying the number of training batches to run before validation) when initializing the Trainer. This is + because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation + interval when ``val_check_interval`` is less than one. Similarly, you can set ``limit_{mode}_batches`` to a float or + an int. If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` + to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. + Here mode can be train/val/test. .. testcode:: @@ -87,3 +90,9 @@ option when using sequential data. # Set val_check_interval trainer = Trainer(val_check_interval=100) + + # Set limit_val_batches to 0.0 or 0 + trainer = Trainer(limit_val_batches=0.0) + + # Set limit_val_batches as an int + trainer = Trainer(limit_val_batches=100) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 36b3ec3229ee39..ea9898da5214e0 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1754,7 +1754,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError(f'input_sample and example_input_array tensors are both missing.') + raise ValueError('`input_sample` and `example_input_array` tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 09186765c6eeec..4eec847580636f 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -212,18 +212,19 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # automatically add samplers self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True) + self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if not _has_len(self.train_dataloader): - self.num_training_batches = float('inf') - else: - # try getting the length - if isinstance(self.limit_train_batches, float): - self.num_training_batches = len(self.train_dataloader) - self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - else: - self.num_training_batches = min(len(self.train_dataloader), self.limit_train_batches) + if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: + self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) + elif self.num_training_batches != float('inf'): + self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) + elif self.limit_train_batches != 1.0: + raise MisconfigurationException( + 'When using an IterableDataset for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + ' `num_training_batches` to use.') # determine when to check validation # if int passed in, val checks that often @@ -241,8 +242,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = float('inf') else: raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - ' or when DataLoader does not implement `__len__`) for `train_dataloader`,' + 'When using an IterableDataset for `train_dataloader`,' ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: @@ -304,24 +304,21 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') + self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') - if num_batches != float('inf'): - self._check_batch_limits(f'limit_{mode}_batches') - - # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, float): - num_batches = int(num_batches * limit_eval_batches) - else: - num_batches = min(len(dataloader), limit_eval_batches) - - elif limit_eval_batches not in (0.0, 1.0): + # limit num batches either as a percent or num steps + if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: + num_batches = min(num_batches, int(limit_eval_batches)) + elif num_batches != float('inf'): + num_batches = int(num_batches * limit_eval_batches) + elif limit_eval_batches != 1.0: raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0` or `1.0`.') + 'When using an IterableDataset for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + f' `num_{mode}_batches` to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py index 20eeff3878cc20..5bea8fbc1a3cd6 100644 --- a/pytorch_lightning/trainer/training_tricks.py +++ b/pytorch_lightning/trainer/training_tricks.py @@ -269,7 +269,7 @@ def _adjust_batch_size(trainer, if hasattr(model, batch_arg_name): setattr(model, batch_arg_name, value) else: - setattr(model.hparams, batch_arg_name, value) + setattr(model.hparams, batch_arg_name, value) new_size = value if desc: log.info(f'Batch size {batch_size} {desc}, trying batch size {new_size}') diff --git a/tests/models/test_onnx_save.py b/tests/models/test_onnx_save.py index f824f33c93bc14..7cb40561f77318 100644 --- a/tests/models/test_onnx_save.py +++ b/tests/models/test_onnx_save.py @@ -84,7 +84,7 @@ def test_error_if_no_input(tmpdir): model = EvalModelTemplate() model.example_input_array = None file_path = os.path.join(tmpdir, "model.onxx") - with pytest.raises(ValueError, match=r'input_sample and example_input_array tensors are both missing'): + with pytest.raises(ValueError, match=r'`input_sample` and `example_input_array` tensors are both missing'): model.to_onnx(file_path) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1c7e21b7a72bb5..1aad5047855a2e 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -256,6 +256,69 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): f'Multiple `test_dataloaders` not initiated properly, got {trainer.test_dataloaders}' +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [ + pytest.param(0.0, 0.0, 0.0), + pytest.param(1.0, 1.0, 1.0), + ] +) +def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, + limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results == 1 + assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') + assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') + + +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [ + pytest.param(0, 0, 0), + pytest.param(10, 10, 10), + ] +) +def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. 
IterableDataset) passed with batch limit as number""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results + assert trainer.num_training_batches == limit_train_batches + assert trainer.num_val_batches[0] == limit_val_batches + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == limit_test_batches + + @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], [ @@ -266,7 +329,7 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): ] ) def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit in percent""" + """Verify num_batches for train, val & test dataloaders passed with batch limit in percent""" model = EvalModelTemplate() model.val_dataloader = model.val_dataloader__multiple_mixed_length model.test_dataloader = model.test_dataloader__multiple_mixed_length @@ -307,7 +370,7 @@ def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit as number""" + """Verify num_batches for train, val & test dataloaders passed with batch limit as number""" os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() @@ -436,7 +499,7 @@ def test_train_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -447,7 +510,7 @@ def test_val_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -458,7 +521,7 @@ def test_test_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) @@ -774,7 +837,7 @@ def test_train_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -785,7 +848,7 @@ def test_val_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -796,5 +859,5 @@ def 
test_test_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) From e31c520c21e1b2090d4cf889a2daac4188e9e2ce Mon Sep 17 00:00:00 2001 From: Ananya Harsh Jha Date: Wed, 5 Aug 2020 13:29:05 -0400 Subject: [PATCH 02/39] add support for sync_bn (#2801) * initial commit for sync_bn * updated changelog * tests * tests * ddp tests hanging with script tests * updated trainer * updated params * test * passingtests * passing tests * passing tests * passing tests * tests * removed apex * doc * doc * doc * doc * docs * tests * tests * tests --- CHANGELOG.md | 2 + pl_examples/basic_examples/sync_bn.py | 204 ++++++++++++++++++ pl_examples/test_examples.py | 14 ++ .../accelerator_backends/ddp_backend.py | 4 + .../accelerator_backends/ddp_spawn_backend.py | 4 + pytorch_lightning/core/lightning.py | 17 ++ pytorch_lightning/trainer/trainer.py | 6 + 7 files changed, 251 insertions(+) create mode 100644 pl_examples/basic_examples/sync_bn.py diff --git a/CHANGELOG.md b/CHANGELOG.md index f1300140f229b7..00fa8d1bf8985f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added SyncBN for DDP ([#2801](https://github.com/PyTorchLightning/pytorch-lightning/pull/2801)) + - Added SSIM metrics ([#2671](https://github.com/PyTorchLightning/pytorch-lightning/pull/2671)) - Added BLEU metrics ([#2535](https://github.com/PyTorchLightning/pytorch-lightning/pull/2535)) diff --git a/pl_examples/basic_examples/sync_bn.py b/pl_examples/basic_examples/sync_bn.py new file mode 100644 index 00000000000000..bb602a74ea89c9 --- /dev/null +++ b/pl_examples/basic_examples/sync_bn.py @@ -0,0 +1,204 @@ +""" +Sync-bn with DDP (GPU) + +This code is to verify that batch statistics are synchronized across GPUs using sync-bn. +When sync_bn is set to True the training loop should run for 3 iterations. +When sync_bn is set to False, the code should result in an AssertionError. +""" +import os +import math +import numpy as np +from argparse import ArgumentParser + +import torch +import torch.nn as nn +import torch.nn.functional as F +import pytorch_lightning as pl + +import torchvision.transforms as transforms +from torchvision.datasets import MNIST +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.distributed import DistributedSampler + + +pl.seed_everything(234) +FLOAT16_EPSILON = np.finfo(np.float16).eps + + +class MNISTDataModule(pl.LightningDataModule): + def __init__(self, data_dir: str = './', batch_size=32, dist_sampler=False): + super().__init__() + + self.dist_sampler = dist_sampler + self.data_dir = data_dir + self.batch_size = batch_size + + self.transforms = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + # self.dims is returned when you call dm.size() + # Setting default dims here because we know them. 
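+        # For MNIST, (1, 28, 28) means (channels, height, width).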
+ # Could optionally be assigned dynamically in dm.setup() + self.dims = (1, 28, 28) + + def prepare_data(self): + # download only + MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()) + MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()) + + def setup(self, stage=None): + + # Assign train/val datasets for use in dataloaders + if stage == 'fit' or stage is None: + self.mnist_train = MNIST(self.data_dir, train=True, transform=self.transforms) + + # Assign test dataset for use in dataloader(s) + if stage == 'test' or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transforms) + + def train_dataloader(self): + dist_sampler = None + if self.dist_sampler: + dist_sampler = DistributedSampler(self.mnist_train, shuffle=False) + + return DataLoader( + self.mnist_train, batch_size=self.batch_size, sampler=dist_sampler, shuffle=False + ) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=self.batch_size, shuffle=False) + + +class SyncBNModule(pl.LightningModule): + def __init__(self, gpu_count=1, **kwargs): + super().__init__() + + self.gpu_count = gpu_count + self.bn_targets = None + if 'bn_targets' in kwargs: + self.bn_targets = kwargs['bn_targets'] + + self.linear = nn.Linear(28 * 28, 10) + self.bn_layer = nn.BatchNorm1d(28 * 28) + + def forward(self, x, batch_idx): + with torch.no_grad(): + out_bn = self.bn_layer(x.view(x.size(0), -1)) + + if self.bn_targets: + bn_target = self.bn_targets[batch_idx] + + # executes on both GPUs + bn_target = bn_target[self.trainer.local_rank::self.gpu_count] + bn_target = bn_target.to(out_bn.device) + assert torch.sum(torch.abs(bn_target - out_bn)) < FLOAT16_EPSILON + + out = self.linear(out_bn) + + return out, out_bn + + def training_step(self, batch, batch_idx): + x, y = batch + + y_hat, _ = self(x, batch_idx) + loss = F.cross_entropy(y_hat, y) + + return pl.TrainResult(loss) + + def configure_optimizers(self): + return torch.optim.Adam(self.linear.parameters(), lr=0.02) + + @staticmethod + def add_model_specific_argument(parent_parser, root_dir): + """ + Define parameters that only apply to this model + """ + parser = ArgumentParser(parents=[parent_parser]) + + parser.add_argument('--nodes', default=1, type=int) + parser.add_argument('--gpu', default=2, type=int) + parser.add_argument('--dist_backend', default='ddp', type=str) + + parser.add_argument('--epochs', default=1, type=int) + parser.add_argument('--steps', default=3, type=int) + + parser.add_argument('--bn_sync', action='store_true') + + return parser + + +def main(args, datamodule, bn_outputs): + """Main training routine specific for this project.""" + # ------------------------ + # 1 INIT LIGHTNING MODEL + # ------------------------ + model = SyncBNModule(gpu_count=args.gpu, bn_targets=bn_outputs) + + # ------------------------ + # 2 INIT TRAINER + # ------------------------ + trainer = pl.Trainer( + gpus=args.gpu, + num_nodes=args.nodes, + distributed_backend=args.dist_backend, + max_epochs=args.epochs, + max_steps=args.steps, + sync_bn=args.bn_sync, + num_sanity_val_steps=0, + replace_sampler_ddp=False, + ) + + # ------------------------ + # 3 START TRAINING + # ------------------------ + trainer.fit(model, datamodule) + + +def run_cli(): + root_dir = os.path.dirname(os.path.realpath(__file__)) + parent_parser = ArgumentParser(add_help=False) + + # define datamodule and dataloader + dm = MNISTDataModule() + dm.prepare_data() + dm.setup(stage=None) + + train_dataloader = 
dm.train_dataloader() + model = SyncBNModule() + + bn_outputs = [] + + # shuffle is false by default + for batch_idx, batch in enumerate(train_dataloader): + x, y = batch + + out, out_bn = model.forward(x, batch_idx) + bn_outputs.append(out_bn) + + # get 3 steps + if batch_idx == 2: + break + + bn_outputs = [x.cuda() for x in bn_outputs] + + # reset datamodule + # batch-size = 16 because 2 GPUs in DDP + dm = MNISTDataModule(batch_size=16, dist_sampler=True) + dm.prepare_data() + dm.setup(stage=None) + + # each LightningModule defines arguments relevant to it + parser = SyncBNModule.add_model_specific_argument(parent_parser, root_dir=root_dir) + parser = pl.Trainer.add_argparse_args(parser) + args = parser.parse_args() + + # --------------------- + # RUN TRAINING + # --------------------- + main(args, dm, bn_outputs) + + +if __name__ == '__main__': + run_cli() diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index 330135e8ea78ab..d527354647f075 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -25,6 +25,20 @@ def test_gpu_template(cli_args): run_cli() +@pytest.mark.parametrize( + 'cli_args', + ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2 --dist_backend ddp_spawn --bn_sync'] +) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_sync_bn(cli_args): + """Test running CLI for an example with sync bn.""" + from pl_examples.basic_examples.sync_bn import run_cli + + cli_args = cli_args.split(' ') if cli_args else [] + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + run_cli() + + # @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2']) # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # def test_multi_node_ddp(cli_args): diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index 0b90a834746127..c2e549c18ef1a9 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -176,6 +176,10 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 self.trainer.lr_schedulers = lr_schedulers self.trainer.optimizer_frequencies = optimizer_frequencies + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_bn: + model = model.configure_sync_bn(model) + # MODEL # copy model to each gpu if self.trainer.on_gpu: diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 8301cd8a71e162..85de2d1b7759ec 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -118,6 +118,10 @@ def ddp_train(self, process_idx, mp_queue, model): self.trainer.lr_schedulers = lr_schedulers self.trainer.optimizer_frequencies = optimizer_frequencies + # call sync_bn before .cuda(), configure_apex and configure_ddp + if self.trainer.sync_bn: + model = model.configure_sync_bn(model) + # MODEL # copy model to each gpu if self.trainer.on_gpu: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index ea9898da5214e0..c09d981d1d5e3b 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -957,6 +957,23 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi 
log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + def configure_sync_bn(self, model: 'LightningModule') -> 'LightningModule': + """ + Add global batchnorm for a model spread across multiple GPUs and nodes. + + Override to synchronize batchnorm between specific process groups instead + of the whole world or use a different sync_bn like `apex`'s version. + + Args: + model: pointer to current :class:`LightningModule`. + + Return: + LightningModule with batchnorm layers synchronized between process groups + """ + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) + + return model + def configure_apex( self, amp: object, model: 'LightningModule', optimizers: List[Optimizer], amp_level: str ) -> Tuple['LightningModule', List[Optimizer]]: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 323f2866b1cabf..ebfe680fc3372b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -184,6 +184,7 @@ def __init__( log_save_interval: int = 100, row_log_interval: int = 50, distributed_backend: Optional[str] = None, + sync_bn: bool = False, precision: int = 32, weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT, weights_save_path: Optional[str] = None, @@ -296,6 +297,8 @@ def __init__( distributed_backend: The distributed backend to use (dp, ddp, ddp2, ddp_spawn, ddp_cpu) + sync_bn: Synchronize batch norm layers between process groups/whole world. + precision: Full precision (32), half precision (16). Can be used on CPU, GPU or TPUs. weights_summary: Prints a summary of the weights when training begins. @@ -427,6 +430,9 @@ def __init__( self.num_nodes = num_nodes self.log_gpu_memory = log_gpu_memory + # sync-bn backend + self.sync_bn = sync_bn + self.gradient_clip_val = gradient_clip_val self.check_val_every_n_epoch = check_val_every_n_epoch From 2cbb1496d01213329b8f1c31936d26db8b2338b5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 13:37:11 -0400 Subject: [PATCH 03/39] Update __init__.py --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 87d62347b47089..6b948f5e07528d 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '0.9.0rc6' +__version__ = '0.9.0rc7' __author__ = 'William Falcon et al.' 
__author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From 5bbcb8db1f1f9b8803e08085d8fe8975c89070e6 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Thu, 6 Aug 2020 00:10:11 +0630 Subject: [PATCH 04/39] Improve SSIM (#2833) * make ssim fast * remove padding * pep8 * add comments for readability * plus -> coef --- .../metrics/functional/regression.py | 30 +++++++++++------- pytorch_lightning/metrics/regression.py | 4 +-- tests/metrics/functional/test_regression.py | 31 ++++++++++--------- tests/metrics/test_regression.py | 2 +- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/pytorch_lightning/metrics/functional/regression.py b/pytorch_lightning/metrics/functional/regression.py index 68f7bef93f7ea0..84f50d37270ca8 100644 --- a/pytorch_lightning/metrics/functional/regression.py +++ b/pytorch_lightning/metrics/functional/regression.py @@ -236,9 +236,9 @@ def ssim( Example: >>> pred = torch.rand([16, 1, 16, 16]) - >>> target = pred * 1.25 + >>> target = pred * 0.75 >>> ssim(pred, target) - tensor(0.9520) + tensor(0.9219) """ if pred.dtype != target.dtype: @@ -280,16 +280,24 @@ def ssim( channel = pred.size(1) kernel = _gaussian_kernel(channel, kernel_size, sigma, device) - mu_pred = F.conv2d(pred, kernel, groups=channel) - mu_target = F.conv2d(target, kernel, groups=channel) - mu_pred_sq = mu_pred.pow(2) - mu_target_sq = mu_target.pow(2) - mu_pred_target = mu_pred * mu_target - - sigma_pred_sq = F.conv2d(pred * pred, kernel, groups=channel) - mu_pred_sq - sigma_target_sq = F.conv2d(target * target, kernel, groups=channel) - mu_target_sq - sigma_pred_target = F.conv2d(pred * target, kernel, groups=channel) - mu_pred_target + # Concatenate + # pred for mu_pred + # target for mu_target + # pred * pred for sigma_pred + # target * target for sigma_target + # pred * target for sigma_pred_target + input_list = torch.cat([pred, target, pred * pred, target * target, pred * target]) # (5 * B, C, H, W) + outputs = F.conv2d(input_list, kernel, groups=channel) + output_list = [outputs[x * pred.size(0): (x + 1) * pred.size(0)] for x in range(len(outputs))] + + mu_pred_sq = output_list[0].pow(2) + mu_target_sq = output_list[1].pow(2) + mu_pred_target = output_list[0] * output_list[1] + + sigma_pred_sq = output_list[2] - mu_pred_sq + sigma_target_sq = output_list[3] - mu_target_sq + sigma_pred_target = output_list[4] - mu_pred_target UPPER = 2 * sigma_pred_target + C2 LOWER = sigma_pred_sq + sigma_target_sq + C2 diff --git a/pytorch_lightning/metrics/regression.py b/pytorch_lightning/metrics/regression.py index 5b69868e1f776c..a2cbaaf4f822a9 100644 --- a/pytorch_lightning/metrics/regression.py +++ b/pytorch_lightning/metrics/regression.py @@ -241,10 +241,10 @@ class SSIM(Metric): Example: >>> pred = torch.rand([16, 1, 16, 16]) - >>> target = pred * 1.25 + >>> target = pred * 0.75 >>> metric = SSIM() >>> metric(pred, target) - tensor(0.9520) + tensor(0.9219) """ def __init__( diff --git a/tests/metrics/functional/test_regression.py b/tests/metrics/functional/test_regression.py index c9df4f1ba3b9e8..cd251c77a98fc1 100644 --- a/tests/metrics/functional/test_regression.py +++ b/tests/metrics/functional/test_regression.py @@ -97,24 +97,25 @@ def test_psnr_against_sklearn(sklearn_metric, torch_metric): assert torch.allclose(sk_score, pl_score) -@pytest.mark.parametrize(['size', 'channel', 'plus', 'multichannel'], [ - pytest.param(16, 1, 0.125, False), - pytest.param(32, 1, 0.25, False), - pytest.param(48, 3, 0.5, True), - pytest.param(64, 4, 0.75, True), - pytest.param(128, 5, 1, True) 
+@pytest.mark.parametrize(['size', 'channel', 'coef', 'multichannel'], [ + pytest.param(16, 1, 0.9, False), + pytest.param(32, 3, 0.8, True), + pytest.param(48, 4, 0.7, True), + pytest.param(64, 5, 0.6, True) ]) -def test_ssim(size, channel, plus, multichannel): +def test_ssim(size, channel, coef, multichannel): device = "cuda" if torch.cuda.is_available() else "cpu" - pred = torch.rand(1, channel, size, size, device=device) - target = pred + plus - ssim_idx = ssim(pred, target) - np_pred = np.random.rand(size, size, channel) + pred = torch.rand(size, channel, size, size, device=device) + target = pred * coef + ssim_idx = ssim(pred, target, data_range=1.0) + np_pred = pred.permute(0, 2, 3, 1).cpu().numpy() if multichannel is False: - np_pred = np_pred[:, :, 0] - np_target = np.add(np_pred, plus) - sk_ssim_idx = ski_ssim(np_pred, np_target, win_size=11, multichannel=multichannel, gaussian_weights=True) - assert torch.allclose(ssim_idx, torch.tensor(sk_ssim_idx, dtype=torch.float, device=device), atol=1e-2, rtol=1e-2) + np_pred = np_pred[:, :, :, 0] + np_target = np.multiply(np_pred, coef) + sk_ssim_idx = ski_ssim( + np_pred, np_target, win_size=11, multichannel=multichannel, gaussian_weights=True, data_range=1.0 + ) + assert torch.allclose(ssim_idx, torch.tensor(sk_ssim_idx, dtype=torch.float, device=device), atol=1e-4) ssim_idx = ssim(pred, pred) assert torch.allclose(ssim_idx, torch.tensor(1.0, device=device)) diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py index 955e6253e3225b..e5ecd51c775b90 100644 --- a/tests/metrics/test_regression.py +++ b/tests/metrics/test_regression.py @@ -65,6 +65,6 @@ def test_ssim(): assert ssim.name == 'ssim' pred = torch.rand([16, 1, 16, 16]) - target = pred * 1.25 + target = pred * 0.75 score = ssim(pred, target) assert isinstance(score, torch.Tensor) From 6034d5e37d508bda133e9cbf4d0f590c7d173f56 Mon Sep 17 00:00:00 2001 From: "Ruotian(RT) Luo" Date: Wed, 5 Aug 2020 12:42:21 -0500 Subject: [PATCH 05/39] fix apex gradient clipping (#2829) --- pytorch_lightning/trainer/training_loop.py | 4 ++-- pytorch_lightning/trainer/training_tricks.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index e0a7b43a872aa3..c8cb81ed090b19 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -291,7 +291,7 @@ def transfer_batch_to_tpu(self, *args): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod - def clip_gradients(self): + def clip_gradients(self, *args): """Warning: this is just empty shell for code implemented in other class.""" @abstractmethod @@ -817,7 +817,7 @@ def run_batch_backward_pass(self, split_batch, batch_idx, opt_idx, optimizer): # ------------------ if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu: self.scaler.unscale_(optimizer) - self.clip_gradients() + self.clip_gradients(optimizer) # ------------------ # .STEP + ZERO_GRAD diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py index 5bea8fbc1a3cd6..44b66407c7645d 100644 --- a/pytorch_lightning/trainer/training_tricks.py +++ b/pytorch_lightning/trainer/training_tricks.py @@ -27,9 +27,17 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.loggers.base import DummyLogger +from 
pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.memory import is_oom_error, garbage_collection_cuda +try: + from apex import amp +except ImportError: + APEX_AVAILABLE = False +else: + APEX_AVAILABLE = True + EPSILON = 1e-6 EPSILON_FP16 = 1e-5 @@ -60,14 +68,17 @@ def restore(self, *args): def fit(self, *args): """Warning: this is just empty shell for code implemented in other class.""" - def clip_gradients(self): + def clip_gradients(self, optimizer): # this code is a modification of torch.nn.utils.clip_grad_norm_ # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md if self.gradient_clip_val <= 0: return model = self.get_model() - parameters = model.parameters() + if self.use_amp and not NATIVE_AMP_AVALAIBLE: + parameters = amp.master_params(optimizer) + else: + parameters = model.parameters() max_norm = float(self.gradient_clip_val) norm_type = float(2.0) if isinstance(parameters, torch.Tensor): From bef27c58eda4c4425c8aa750d38e16522bfcbe39 Mon Sep 17 00:00:00 2001 From: "Ruotian(RT) Luo" Date: Wed, 5 Aug 2020 12:43:50 -0500 Subject: [PATCH 06/39] save apex scaler states (#2828) --- pytorch_lightning/trainer/training_io.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 90f36ecab59c32..666dfbb2588b72 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -115,6 +115,13 @@ else: XLA_AVAILABLE = True +try: + from apex import amp +except ImportError: + APEX_AVAILABLE = False +else: + APEX_AVAILABLE = True + try: import horovod.torch as hvd except (ModuleNotFoundError, ImportError): @@ -317,6 +324,8 @@ def restore(self, checkpoint_path: str, on_gpu: bool): # restore amp scaling if self.use_amp and NATIVE_AMP_AVALAIBLE and 'native_amp_scaling_state' in checkpoint: self.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + elif self.use_amp and not NATIVE_AMP_AVALAIBLE and 'amp_scaling_state' in checkpoint: + amp.load_state_dict(checkpoint['amp_scaling_state']) # load training state (affects trainer only) self.restore_training_state(checkpoint) @@ -368,6 +377,8 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: # save native amp scaling if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu: checkpoint['native_amp_scaling_state'] = self.scaler.state_dict() + elif self.use_amp and not NATIVE_AMP_AVALAIBLE: + checkpoint['amp_scaling_state'] = amp.state_dict() # add the module_arguments and state_dict from the model model = self.get_model() @@ -523,6 +534,8 @@ def hpc_load(self, folderpath, on_gpu): # restore amp scaling if self.use_amp and NATIVE_AMP_AVALAIBLE and 'native_amp_scaling_state' in checkpoint: self.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + elif self.use_amp and not NATIVE_AMP_AVALAIBLE and 'amp_scaling_state' in checkpoint: + amp.load_state_dict(checkpoint['amp_scaling_state']) if self.root_gpu is not None: model.cuda(self.root_gpu) From d09098ca5af65f200be2ff2a9a97d6204beff4c7 Mon Sep 17 00:00:00 2001 From: Rosario Scalise Date: Wed, 5 Aug 2020 12:06:26 -0700 Subject: [PATCH 07/39] [DOCS] title clarification in Results page (#2827) * title tweak * remove changes in new-project Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- docs/source/results.rst | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/docs/source/results.rst b/docs/source/results.rst index a5b01ee42e635b..960cda2bcf399f 100644 --- a/docs/source/results.rst +++ b/docs/source/results.rst @@ -40,8 +40,8 @@ using the equivalent syntax via the `TrainResult` object: -------------------- -Validation loop example ------------------------ +Validation/Test loop example +----------------------------- We can replace the following validation/test loop: .. code-block:: python From 2242af11b677395898951ef620ad65c64d7603a8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 21:43:33 +0200 Subject: [PATCH 08/39] another try to filter master from CircleCI jobs (#2734) * circleci config * Apply suggestions from code review * miss --- .circleci/config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9d287432327113..fdb217baff47eb 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -153,11 +153,11 @@ workflows: filters: branches: # https://discuss.circleci.com/t/create-separate-steps-jobs-for-pr-forks-versus-branches/13419/4 - only: - # only from forks - - /^pull\/.*$/ - # only from canonical repository - - /^(?!pull\/).*$/ + #only: + # # only from forks + # - /^pull\/.\d+$/ + ignore: + - master cleanup: triggers: - schedule: From 5d0f0325d854a901fbde85fab909cd866b30fc7c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 15:57:26 -0400 Subject: [PATCH 09/39] Revert "Support limit_mode_batches (int) for infinite dataloader" (#2839) * Revert "Support limit_mode_batches (int) for infinite dataloader (#2787)" This reverts commit de9c9f0864418a83f295e4c87be50e12645bd83a. * Update training_tricks.py --- CHANGELOG.md | 2 - docs/source/sequences.rst | 21 ++---- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/data_loading.py | 45 +++++++------ tests/models/test_onnx_save.py | 2 +- tests/trainer/test_dataloaders.py | 79 +++-------------------- 6 files changed, 40 insertions(+), 111 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00fa8d1bf8985f..2e800c28964ff8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,8 +33,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) -- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) - ### Changed - Truncated long version numbers in progress bar ([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594)) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index b9a8f2ee642aad..e24ee5bbca1cc9 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -49,8 +49,8 @@ Lightning can handle TBTT automatically via this flag. .. note:: If you need to modify how the batch is split, override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`. -.. note:: Using this feature requires updating your LightningModule's - :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg. +.. 
note:: Using this feature requires updating your LightningModule's :meth:`pytorch_lightning.core.LightningModule.training_step` to include + a `hiddens` arg. ---------- @@ -59,13 +59,10 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the ``val_check_interval`` to 1.0 (the default) or an int - (specifying the number of training batches to run before validation) when initializing the Trainer. This is - because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation - interval when ``val_check_interval`` is less than one. Similarly, you can set ``limit_{mode}_batches`` to a float or - an int. If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` - to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. - Here mode can be train/val/test. +.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or to an int + (specifying the number of training batches to run before validation) when initializing the Trainer. + This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate + the validation interval when val_check_interval is less than one. .. testcode:: @@ -90,9 +87,3 @@ option when using sequential data. # Set val_check_interval trainer = Trainer(val_check_interval=100) - - # Set limit_val_batches to 0.0 or 0 - trainer = Trainer(limit_val_batches=0.0) - - # Set limit_val_batches as an int - trainer = Trainer(limit_val_batches=100) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index c09d981d1d5e3b..80081c0dd446f6 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1771,7 +1771,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError('`input_sample` and `example_input_array` tensors are both missing.') + raise ValueError(f'input_sample and example_input_array tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 4eec847580636f..09186765c6eeec 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -212,19 +212,18 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # automatically add samplers self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True) - self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: - self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) - elif self.num_training_batches != float('inf'): - self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - elif self.limit_train_batches != 1.0: - raise MisconfigurationException( - 'When using an IterableDataset for `limit_train_batches`,' - ' 
`Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - ' `num_training_batches` to use.') + if not _has_len(self.train_dataloader): + self.num_training_batches = float('inf') + else: + # try getting the length + if isinstance(self.limit_train_batches, float): + self.num_training_batches = len(self.train_dataloader) + self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) + else: + self.num_training_batches = min(len(self.train_dataloader), self.limit_train_batches) # determine when to check validation # if int passed in, val checks that often @@ -242,7 +241,8 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = float('inf') else: raise MisconfigurationException( - 'When using an IterableDataset for `train_dataloader`,' + 'When using an infinite DataLoader (e.g. with an IterableDataset' + ' or when DataLoader does not implement `__len__`) for `train_dataloader`,' ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: @@ -304,21 +304,24 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') - self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') - # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: - num_batches = min(num_batches, int(limit_eval_batches)) - elif num_batches != float('inf'): - num_batches = int(num_batches * limit_eval_batches) - elif limit_eval_batches != 1.0: + if num_batches != float('inf'): + self._check_batch_limits(f'limit_{mode}_batches') + + # limit num batches either as a percent or num steps + if isinstance(limit_eval_batches, float): + num_batches = int(num_batches * limit_eval_batches) + else: + num_batches = min(len(dataloader), limit_eval_batches) + + elif limit_eval_batches not in (0.0, 1.0): raise MisconfigurationException( - 'When using an IterableDataset for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - f' `num_{mode}_batches` to use.') + 'When using an infinite DataLoader (e.g. 
with an IterableDataset' + f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0` or `1.0`.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) diff --git a/tests/models/test_onnx_save.py b/tests/models/test_onnx_save.py index 7cb40561f77318..f824f33c93bc14 100644 --- a/tests/models/test_onnx_save.py +++ b/tests/models/test_onnx_save.py @@ -84,7 +84,7 @@ def test_error_if_no_input(tmpdir): model = EvalModelTemplate() model.example_input_array = None file_path = os.path.join(tmpdir, "model.onxx") - with pytest.raises(ValueError, match=r'`input_sample` and `example_input_array` tensors are both missing'): + with pytest.raises(ValueError, match=r'input_sample and example_input_array tensors are both missing'): model.to_onnx(file_path) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1aad5047855a2e..1c7e21b7a72bb5 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -256,69 +256,6 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): f'Multiple `test_dataloaders` not initiated properly, got {trainer.test_dataloaders}' -@pytest.mark.parametrize( - ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], - [ - pytest.param(0.0, 0.0, 0.0), - pytest.param(1.0, 1.0, 1.0), - ] -) -def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, - limit_val_batches, limit_test_batches): - """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" - model = EvalModelTemplate() - model.train_dataloader = model.train_dataloader__infinite - model.val_dataloader = model.val_dataloader__infinite - model.test_dataloader = model.test_dataloader__infinite - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=limit_train_batches, - limit_val_batches=limit_val_batches, - limit_test_batches=limit_test_batches, - ) - - results = trainer.fit(model) - assert results == 1 - assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') - assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') - - trainer.test(ckpt_path=None) - assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') - - -@pytest.mark.parametrize( - ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], - [ - pytest.param(0, 0, 0), - pytest.param(10, 10, 10), - ] -) -def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify inf train, val & test dataloaders (e.g. 
IterableDataset) passed with batch limit as number""" - model = EvalModelTemplate() - model.train_dataloader = model.train_dataloader__infinite - model.val_dataloader = model.val_dataloader__infinite - model.test_dataloader = model.test_dataloader__infinite - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=limit_train_batches, - limit_val_batches=limit_val_batches, - limit_test_batches=limit_test_batches, - ) - - results = trainer.fit(model) - assert results - assert trainer.num_training_batches == limit_train_batches - assert trainer.num_val_batches[0] == limit_val_batches - - trainer.test(ckpt_path=None) - assert trainer.num_test_batches[0] == limit_test_batches - - @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], [ @@ -329,7 +266,7 @@ def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for train, val & test dataloaders passed with batch limit in percent""" + """Verify num_batches for val & test dataloaders passed with batch limit in percent""" model = EvalModelTemplate() model.val_dataloader = model.val_dataloader__multiple_mixed_length model.test_dataloader = model.test_dataloader__multiple_mixed_length @@ -370,7 +307,7 @@ def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for train, val & test dataloaders passed with batch limit as number""" + """Verify num_batches for val & test dataloaders passed with batch limit as number""" os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() @@ -499,7 +436,7 @@ def test_train_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -510,7 +447,7 @@ def test_val_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -521,7 +458,7 @@ def test_test_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.test(model) @@ -837,7 +774,7 @@ def test_train_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -848,7 +785,7 @@ def test_val_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.fit(model) @@ -859,5 +796,5 @@ 
def test_test_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='using an IterableDataset'): + with pytest.raises(MisconfigurationException, match='infinite DataLoader'): trainer.test(model) From 633cf76c686357c88f2d6397fa316ed710004184 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 15:58:27 -0400 Subject: [PATCH 10/39] Update __init__.py --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 6b948f5e07528d..f3bde1c7ee5738 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '0.9.0rc7' +__version__ = '0.9.0rc8' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From a5f2b89ed08172c25fd1cdc3d884d0fbb60bc45c Mon Sep 17 00:00:00 2001 From: Ananya Harsh Jha Date: Wed, 5 Aug 2020 19:12:11 -0400 Subject: [PATCH 11/39] updated sync bn (#2838) * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * updated sync bn * added ddp_spawn test * updated test * clean * clean Co-authored-by: Jirka Borovec --- pl_examples/basic_examples/sync_bn.py | 204 ------------------ pl_examples/test_examples.py | 14 -- .../accelerator_backends/ddp_backend.py | 4 +- .../accelerator_backends/ddp_spawn_backend.py | 4 +- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/__init__.py | 8 + pytorch_lightning/trainer/trainer.py | 6 +- tests/base/datamodules.py | 48 ++++- tests/models/test_sync_batchnorm.py | 100 +++++++++ 9 files changed, 163 insertions(+), 227 deletions(-) delete mode 100644 pl_examples/basic_examples/sync_bn.py create mode 100644 tests/models/test_sync_batchnorm.py diff --git a/pl_examples/basic_examples/sync_bn.py b/pl_examples/basic_examples/sync_bn.py deleted file mode 100644 index bb602a74ea89c9..00000000000000 --- a/pl_examples/basic_examples/sync_bn.py +++ /dev/null @@ -1,204 +0,0 @@ -""" -Sync-bn with DDP (GPU) - -This code is to verify that batch statistics are synchronized across GPUs using sync-bn. -When sync_bn is set to True the training loop should run for 3 iterations. -When sync_bn is set to False, the code should result in an AssertionError. -""" -import os -import math -import numpy as np -from argparse import ArgumentParser - -import torch -import torch.nn as nn -import torch.nn.functional as F -import pytorch_lightning as pl - -import torchvision.transforms as transforms -from torchvision.datasets import MNIST -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.distributed import DistributedSampler - - -pl.seed_everything(234) -FLOAT16_EPSILON = np.finfo(np.float16).eps - - -class MNISTDataModule(pl.LightningDataModule): - def __init__(self, data_dir: str = './', batch_size=32, dist_sampler=False): - super().__init__() - - self.dist_sampler = dist_sampler - self.data_dir = data_dir - self.batch_size = batch_size - - self.transforms = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - - # self.dims is returned when you call dm.size() - # Setting default dims here because we know them. 
- # Could optionally be assigned dynamically in dm.setup() - self.dims = (1, 28, 28) - - def prepare_data(self): - # download only - MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()) - MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor()) - - def setup(self, stage=None): - - # Assign train/val datasets for use in dataloaders - if stage == 'fit' or stage is None: - self.mnist_train = MNIST(self.data_dir, train=True, transform=self.transforms) - - # Assign test dataset for use in dataloader(s) - if stage == 'test' or stage is None: - self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transforms) - - def train_dataloader(self): - dist_sampler = None - if self.dist_sampler: - dist_sampler = DistributedSampler(self.mnist_train, shuffle=False) - - return DataLoader( - self.mnist_train, batch_size=self.batch_size, sampler=dist_sampler, shuffle=False - ) - - def test_dataloader(self): - return DataLoader(self.mnist_test, batch_size=self.batch_size, shuffle=False) - - -class SyncBNModule(pl.LightningModule): - def __init__(self, gpu_count=1, **kwargs): - super().__init__() - - self.gpu_count = gpu_count - self.bn_targets = None - if 'bn_targets' in kwargs: - self.bn_targets = kwargs['bn_targets'] - - self.linear = nn.Linear(28 * 28, 10) - self.bn_layer = nn.BatchNorm1d(28 * 28) - - def forward(self, x, batch_idx): - with torch.no_grad(): - out_bn = self.bn_layer(x.view(x.size(0), -1)) - - if self.bn_targets: - bn_target = self.bn_targets[batch_idx] - - # executes on both GPUs - bn_target = bn_target[self.trainer.local_rank::self.gpu_count] - bn_target = bn_target.to(out_bn.device) - assert torch.sum(torch.abs(bn_target - out_bn)) < FLOAT16_EPSILON - - out = self.linear(out_bn) - - return out, out_bn - - def training_step(self, batch, batch_idx): - x, y = batch - - y_hat, _ = self(x, batch_idx) - loss = F.cross_entropy(y_hat, y) - - return pl.TrainResult(loss) - - def configure_optimizers(self): - return torch.optim.Adam(self.linear.parameters(), lr=0.02) - - @staticmethod - def add_model_specific_argument(parent_parser, root_dir): - """ - Define parameters that only apply to this model - """ - parser = ArgumentParser(parents=[parent_parser]) - - parser.add_argument('--nodes', default=1, type=int) - parser.add_argument('--gpu', default=2, type=int) - parser.add_argument('--dist_backend', default='ddp', type=str) - - parser.add_argument('--epochs', default=1, type=int) - parser.add_argument('--steps', default=3, type=int) - - parser.add_argument('--bn_sync', action='store_true') - - return parser - - -def main(args, datamodule, bn_outputs): - """Main training routine specific for this project.""" - # ------------------------ - # 1 INIT LIGHTNING MODEL - # ------------------------ - model = SyncBNModule(gpu_count=args.gpu, bn_targets=bn_outputs) - - # ------------------------ - # 2 INIT TRAINER - # ------------------------ - trainer = pl.Trainer( - gpus=args.gpu, - num_nodes=args.nodes, - distributed_backend=args.dist_backend, - max_epochs=args.epochs, - max_steps=args.steps, - sync_bn=args.bn_sync, - num_sanity_val_steps=0, - replace_sampler_ddp=False, - ) - - # ------------------------ - # 3 START TRAINING - # ------------------------ - trainer.fit(model, datamodule) - - -def run_cli(): - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - - # define datamodule and dataloader - dm = MNISTDataModule() - dm.prepare_data() - dm.setup(stage=None) - - train_dataloader = 
dm.train_dataloader() - model = SyncBNModule() - - bn_outputs = [] - - # shuffle is false by default - for batch_idx, batch in enumerate(train_dataloader): - x, y = batch - - out, out_bn = model.forward(x, batch_idx) - bn_outputs.append(out_bn) - - # get 3 steps - if batch_idx == 2: - break - - bn_outputs = [x.cuda() for x in bn_outputs] - - # reset datamodule - # batch-size = 16 because 2 GPUs in DDP - dm = MNISTDataModule(batch_size=16, dist_sampler=True) - dm.prepare_data() - dm.setup(stage=None) - - # each LightningModule defines arguments relevant to it - parser = SyncBNModule.add_model_specific_argument(parent_parser, root_dir=root_dir) - parser = pl.Trainer.add_argparse_args(parser) - args = parser.parse_args() - - # --------------------- - # RUN TRAINING - # --------------------- - main(args, dm, bn_outputs) - - -if __name__ == '__main__': - run_cli() diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index d527354647f075..330135e8ea78ab 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -25,20 +25,6 @@ def test_gpu_template(cli_args): run_cli() -@pytest.mark.parametrize( - 'cli_args', - ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2 --dist_backend ddp_spawn --bn_sync'] -) -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_sync_bn(cli_args): - """Test running CLI for an example with sync bn.""" - from pl_examples.basic_examples.sync_bn import run_cli - - cli_args = cli_args.split(' ') if cli_args else [] - with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - run_cli() - - # @pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --num_nodes 1 --gpus 2']) # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # def test_multi_node_ddp(cli_args): diff --git a/pytorch_lightning/accelerator_backends/ddp_backend.py b/pytorch_lightning/accelerator_backends/ddp_backend.py index c2e549c18ef1a9..44ad52d34ba2f9 100644 --- a/pytorch_lightning/accelerator_backends/ddp_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_backend.py @@ -177,8 +177,8 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 self.trainer.optimizer_frequencies = optimizer_frequencies # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_bn: - model = model.configure_sync_bn(model) + if self.trainer.sync_batchnorm: + model = model.configure_sync_batchnorm(model) # MODEL # copy model to each gpu diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 85de2d1b7759ec..704fc5558588a4 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -119,8 +119,8 @@ def ddp_train(self, process_idx, mp_queue, model): self.trainer.optimizer_frequencies = optimizer_frequencies # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_bn: - model = model.configure_sync_bn(model) + if self.trainer.sync_batchnorm: + model = model.configure_sync_batchnorm(model) # MODEL # copy model to each gpu diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 80081c0dd446f6..d272c23fd9a659 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -957,7 +957,7 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi 
log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - def configure_sync_bn(self, model: 'LightningModule') -> 'LightningModule': + def configure_sync_batchnorm(self, model: 'LightningModule') -> 'LightningModule': """ Add global batchnorm for a model spread across multiple GPUs and nodes. diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 0164210c771fbd..8dcec8eb305110 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -855,6 +855,14 @@ def on_train_end(self, trainer, pl_module): # default used by the Trainer trainer = Trainer(row_log_interval=50) +sync_batchnorm +^^^^^^^^^^^^^^ + +Enable synchronization between batchnorm layers across all GPUs. + +.. testcode:: + + trainer = Trainer(sync_batchnorm=True) val_percent_check ^^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ebfe680fc3372b..4b342328df2979 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -184,7 +184,7 @@ def __init__( log_save_interval: int = 100, row_log_interval: int = 50, distributed_backend: Optional[str] = None, - sync_bn: bool = False, + sync_batchnorm: bool = False, precision: int = 32, weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT, weights_save_path: Optional[str] = None, @@ -297,7 +297,7 @@ def __init__( distributed_backend: The distributed backend to use (dp, ddp, ddp2, ddp_spawn, ddp_cpu) - sync_bn: Synchronize batch norm layers between process groups/whole world. + sync_batchnorm: Synchronize batch norm layers between process groups/whole world. precision: Full precision (32), half precision (16). Can be used on CPU, GPU or TPUs. @@ -431,7 +431,7 @@ def __init__( self.log_gpu_memory = log_gpu_memory # sync-bn backend - self.sync_bn = sync_bn + self.sync_batchnorm = sync_batchnorm self.gradient_clip_val = gradient_clip_val self.check_val_every_n_epoch = check_val_every_n_epoch diff --git a/tests/base/datamodules.py b/tests/base/datamodules.py index a55a9a718ea9d2..d1f7fabf8d6b46 100644 --- a/tests/base/datamodules.py +++ b/tests/base/datamodules.py @@ -1,7 +1,9 @@ +import os from torch.utils.data import random_split, DataLoader from pytorch_lightning.core.datamodule import LightningDataModule -from tests.base.datasets import TrialMNIST +from tests.base.datasets import TrialMNIST, MNIST +from torch.utils.data.distributed import DistributedSampler class TrialMNISTDataModule(LightningDataModule): @@ -36,3 +38,47 @@ def val_dataloader(self): def test_dataloader(self): return DataLoader(self.mnist_test, batch_size=32) + + +class MNISTDataModule(LightningDataModule): + def __init__( + self, data_dir: str = './', batch_size: int = 32, dist_sampler: bool = False + ) -> None: + super().__init__() + + self.dist_sampler = dist_sampler + self.data_dir = data_dir + self.batch_size = batch_size + + # self.dims is returned when you call dm.size() + # Setting default dims here because we know them. 
+ # Could optionally be assigned dynamically in dm.setup() + self.dims = (1, 28, 28) + + def prepare_data(self): + # download only + MNIST(self.data_dir, train=True, download=True, normalize=(0.1307, 0.3081)) + MNIST(self.data_dir, train=False, download=True, normalize=(0.1307, 0.3081)) + + def setup(self, stage: str = None): + + # Assign train/val datasets for use in dataloaders + # TODO: need to split using random_split once updated to torch >= 1.6 + if stage == 'fit' or stage is None: + self.mnist_train = MNIST(self.data_dir, train=True, normalize=(0.1307, 0.3081)) + + # Assign test dataset for use in dataloader(s) + if stage == 'test' or stage is None: + self.mnist_test = MNIST(self.data_dir, train=False, normalize=(0.1307, 0.3081)) + + def train_dataloader(self): + dist_sampler = None + if self.dist_sampler: + dist_sampler = DistributedSampler(self.mnist_train, shuffle=False) + + return DataLoader( + self.mnist_train, batch_size=self.batch_size, sampler=dist_sampler, shuffle=False + ) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=self.batch_size, shuffle=False) diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py new file mode 100644 index 00000000000000..5aff30d0aacbd9 --- /dev/null +++ b/tests/models/test_sync_batchnorm.py @@ -0,0 +1,100 @@ +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F + +from pytorch_lightning import Trainer, seed_everything, LightningModule, TrainResult +from pytorch_lightning.utilities import FLOAT16_EPSILON +from tests.base.datamodules import MNISTDataModule +from tests.base.develop_utils import set_random_master_port + + +class SyncBNModule(LightningModule): + def __init__(self, gpu_count=1, **kwargs): + super().__init__() + + self.gpu_count = gpu_count + self.bn_targets = None + if 'bn_targets' in kwargs: + self.bn_targets = kwargs['bn_targets'] + + self.linear = nn.Linear(28 * 28, 10) + self.bn_layer = nn.BatchNorm1d(28 * 28) + + def forward(self, x, batch_idx): + with torch.no_grad(): + out_bn = self.bn_layer(x.view(x.size(0), -1)) + + if self.bn_targets: + bn_target = self.bn_targets[batch_idx] + + # executes on both GPUs + bn_target = bn_target[self.trainer.local_rank::self.gpu_count] + bn_target = bn_target.to(out_bn.device) + assert torch.sum(torch.abs(bn_target - out_bn)) < FLOAT16_EPSILON + + out = self.linear(out_bn) + + return out, out_bn + + def training_step(self, batch, batch_idx): + x, y = batch + + y_hat, _ = self(x, batch_idx) + loss = F.cross_entropy(y_hat, y) + + return TrainResult(loss) + + def configure_optimizers(self): + return torch.optim.Adam(self.linear.parameters(), lr=0.02) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_sync_batchnorm_ddp(tmpdir): + seed_everything(234) + set_random_master_port() + + # define datamodule and dataloader + dm = MNISTDataModule() + dm.prepare_data() + dm.setup(stage=None) + + train_dataloader = dm.train_dataloader() + model = SyncBNModule() + + bn_outputs = [] + + # shuffle is false by default + for batch_idx, batch in enumerate(train_dataloader): + x, _ = batch + + _, out_bn = model.forward(x, batch_idx) + bn_outputs.append(out_bn) + + # get 3 steps + if batch_idx == 2: + break + + bn_outputs = [x.cuda() for x in bn_outputs] + + # reset datamodule + # batch-size = 16 because 2 GPUs in DDP + dm = MNISTDataModule(batch_size=16, dist_sampler=True) + dm.prepare_data() + dm.setup(stage=None) + + model = SyncBNModule(gpu_count=2, 
bn_targets=bn_outputs) + + trainer = Trainer( + gpus=2, + num_nodes=1, + distributed_backend='ddp_spawn', + max_epochs=1, + max_steps=3, + sync_batchnorm=True, + num_sanity_val_steps=0, + replace_sampler_ddp=False, + ) + + result = trainer.fit(model, dm) + assert result == 1, "Sync batchnorm failing with DDP" From b507c42c478e7b99acb37f1fe9a0bf72285a1b17 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 20:01:30 -0400 Subject: [PATCH 12/39] clarify batch hooks (#2842) * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook * modified hook --- pytorch_lightning/callbacks/base.py | 8 ++++++++ pytorch_lightning/callbacks/lr_logger.py | 2 +- pytorch_lightning/callbacks/progress.py | 10 +++++----- pytorch_lightning/core/hooks.py | 21 +++++++++++++++++++++ pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/callback_hook.py | 12 +++++++++++- pytorch_lightning/trainer/lr_finder.py | 2 +- pytorch_lightning/trainer/training_loop.py | 19 +++++++++++++++++++ tests/callbacks/test_callbacks.py | 16 ++++++++++++++++ tests/callbacks/test_progress_bar.py | 8 ++++---- tests/core/test_datamodules.py | 6 +++--- tests/loggers/test_all.py | 2 +- tests/trainer/test_trainer.py | 4 ++-- tests/utilities/test_dtype_device_mixin.py | 2 +- 14 files changed, 94 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index a9c6e1fb520cb2..82a0e6b0436a65 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -46,6 +46,14 @@ def on_sanity_check_end(self, trainer, pl_module): """Called when the validation sanity check ends.""" pass + def on_train_batch_start(self, trainer, pl_module): + """Called when the training batch begins.""" + pass + + def on_train_batch_end(self, trainer, pl_module): + """Called when the training batch ends.""" + pass + def on_train_epoch_start(self, trainer, pl_module): """Called when the train epoch begins.""" pass diff --git a/pytorch_lightning/callbacks/lr_logger.py b/pytorch_lightning/callbacks/lr_logger.py index 87953d496b3ad9..7ec73b8c888119 100755 --- a/pytorch_lightning/callbacks/lr_logger.py +++ b/pytorch_lightning/callbacks/lr_logger.py @@ -64,7 +64,7 @@ def on_train_start(self, trainer, pl_module): # Initialize for storing values self.lrs = {name: [] for name in names} - def on_batch_start(self, trainer, pl_module): + def on_train_batch_start(self, trainer, pl_module): latest_stat = self._extract_lr(trainer, 'step') if trainer.logger and latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) diff --git a/pytorch_lightning/callbacks/progress.py b/pytorch_lightning/callbacks/progress.py index 0acdbcc7509eab..4ab990f74724e8 100644 --- a/pytorch_lightning/callbacks/progress.py +++ b/pytorch_lightning/callbacks/progress.py @@ -36,8 +36,8 @@ def __init__(self): def disable(self): self.enable = False - def on_batch_end(self, trainer, pl_module): - super().on_batch_end(trainer, pl_module) # don't forget this :) + def on_train_batch_end(self, trainer, pl_module): + super().on_train_batch_end(trainer, pl_module) # don't forget this :) percent = (self.train_batch_idx / self.total_train_batches) * 100 sys.stdout.flush() sys.stdout.write(f'{percent:.01f} percent complete \r') @@ -138,7 +138,7 @@ def on_train_start(self, trainer, pl_module): def on_epoch_start(self, trainer, pl_module):
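        # the base progress bar restarts its train batch count at the start of every epoch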
self._train_batch_idx = 0 - def on_batch_end(self, trainer, pl_module): + def on_train_batch_end(self, trainer, pl_module): self._train_batch_idx += 1 def on_validation_start(self, trainer, pl_module): @@ -318,8 +318,8 @@ def on_epoch_start(self, trainer, pl_module): self.main_progress_bar.reset(convert_inf(total_batches)) self.main_progress_bar.set_description(f'Epoch {trainer.current_epoch + 1}') - def on_batch_end(self, trainer, pl_module): - super().on_batch_end(trainer, pl_module) + def on_train_batch_end(self, trainer, pl_module): + super().on_train_batch_end(trainer, pl_module) if self.is_enabled and self.train_batch_idx % self.refresh_rate == 0: self.main_progress_bar.update(self.refresh_rate) self.main_progress_bar.set_postfix(trainer.progress_bar_dict) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 8c6b726ac31d27..1218dcbe6760fd 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -77,6 +77,23 @@ def on_train_end(self) -> None: """ # do something at the end of training + def on_train_batch_start(self, batch: Any) -> None: + """ + Called in the training loop before anything happens for that batch. + + If you return -1 here, you will skip training for the rest of the current epoch. + + Args: + batch: The batched data as it is returned by the training DataLoader. + """ + # do something when the batch starts + + def on_train_batch_end(self) -> None: + """ + Called in the training loop after the batch. + """ + # do something when the batch ends + def on_batch_start(self, batch: Any) -> None: """ Called in the training loop before anything happens for that batch. @@ -85,12 +102,16 @@ def on_batch_start(self, batch: Any) -> None: Args: batch: The batched data as it is returned by the training DataLoader. + + .. warning:: Deprecated in 0.9.0, will be removed in 1.0.0 (use `on_train_batch_start` instead) """ # do something when the batch starts def on_batch_end(self) -> None: """ Called in the training loop after the batch. + + .. warning:: Deprecated in 0.9.0, will be removed in 1.0.0 (use `on_train_batch_end` instead) """ # do something when the batch ends diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d272c23fd9a659..f816726ddf1e17 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1771,7 +1771,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError(f'input_sample and example_input_array tensors are both missing.') + raise ValueError('input_sample and example_input_array tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 89b5e712c91909..7c627434553172 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -9,7 +9,7 @@ class TrainerCallbackHookMixin(ABC): # this is just a summary on variables used in this abstract class, # the proper values/initialisation should be done in child class callbacks: List[Callback] = [] - get_model: Callable = ...
+ get_model: Callable def setup(self, stage: str): """Called in the beginning of fit and test""" @@ -111,6 +111,16 @@ def on_batch_end(self): for callback in self.callbacks: callback.on_batch_end(self, self.get_model()) + def on_train_batch_start(self): + """Called when the training batch begins.""" + for callback in self.callbacks: + callback.on_train_batch_start(self, self.get_model()) + + def on_train_batch_end(self): + """Called when the training batch ends.""" + for callback in self.callbacks: + callback.on_train_batch_end(self, self.get_model()) + def on_validation_batch_start(self): """Called when the validation batch begins.""" for callback in self.callbacks: diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py index 3b2778d24071c4..23ad702956e848 100755 --- a/pytorch_lightning/trainer/lr_finder.py +++ b/pytorch_lightning/trainer/lr_finder.py @@ -382,7 +382,7 @@ def on_batch_start(self, trainer, pl_module): self.lrs.append(trainer.lr_schedulers[0]['scheduler'].lr[0]) - def on_batch_end(self, trainer, pl_module): + def on_train_batch_end(self, trainer, pl_module): """ Called when the training batch ends, logs the calculated loss """ if (trainer.batch_idx + 1) % trainer.accumulate_grad_batches != 0: return diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index c8cb81ed090b19..993e8ccd53fd08 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -263,6 +263,8 @@ class TrainerTrainLoopMixin(ABC): on_train_end: Callable on_batch_start: Callable on_batch_end: Callable + on_train_batch_start: Callable + on_train_batch_end: Callable on_epoch_start: Callable on_epoch_end: Callable on_validation_end: Callable @@ -690,6 +692,7 @@ def run_training_batch(self, batch, batch_idx): return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic) # Batch start events + # TODO: deprecate 1.0 with self.profiler.profile('on_batch_start'): # callbacks self.on_batch_start() @@ -699,6 +702,15 @@ def run_training_batch(self, batch, batch_idx): if response == -1: return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) + with self.profiler.profile('on_train_batch_start'): + # callbacks + self.on_train_batch_start() + # hooks + if self.is_function_implemented('on_train_batch_start'): + response = self.get_model().on_train_batch_start(batch) + if response == -1: + return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) + splits = [batch] if self.truncated_bptt_steps is not None: model_ref = self.get_model() @@ -785,6 +797,13 @@ def run_training_batch(self, batch, batch_idx): if self.is_function_implemented('on_batch_end'): self.get_model().on_batch_end() + with self.profiler.profile('on_train_batch_end'): + # callbacks + self.on_train_batch_end() + # model hooks + if self.is_function_implemented('on_train_batch_end'): + self.get_model().on_train_batch_end() + # collapse all metrics into one dict batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()} diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index d10965524394b2..83de82c71de679 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -28,6 +28,8 @@ def __init__(self): self.on_epoch_end_called = False self.on_batch_start_called = False self.on_batch_end_called = False + self.on_train_batch_start_called = False + self.on_train_batch_end_called = False self.on_validation_batch_start_called = False 
self.on_validation_batch_end_called = False self.on_test_batch_start_called = False @@ -87,6 +89,14 @@ def on_batch_end(self, trainer, pl_module): _check_args(trainer, pl_module) self.on_batch_end_called = True + def on_train_batch_start(self, trainer, pl_module): + _check_args(trainer, pl_module) + self.on_train_batch_start_called = True + + def on_train_batch_end(self, trainer, pl_module): + _check_args(trainer, pl_module) + self.on_train_batch_end_called = True + def on_validation_batch_start(self, trainer, pl_module): _check_args(trainer, pl_module) self.on_validation_batch_start_called = True @@ -150,6 +160,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_epoch_start_called assert not test_callback.on_batch_start_called assert not test_callback.on_batch_end_called + assert not test_callback.on_train_batch_start_called + assert not test_callback.on_train_batch_end_called assert not test_callback.on_validation_batch_start_called assert not test_callback.on_validation_batch_end_called assert not test_callback.on_test_batch_start_called @@ -177,6 +189,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_epoch_start_called assert not test_callback.on_batch_start_called assert not test_callback.on_batch_end_called + assert not test_callback.on_train_batch_start_called + assert not test_callback.on_train_batch_end_called assert not test_callback.on_validation_batch_start_called assert not test_callback.on_validation_batch_end_called assert not test_callback.on_test_batch_start_called @@ -202,6 +216,8 @@ def on_test_end(self, trainer, pl_module): assert test_callback.on_epoch_start_called assert test_callback.on_batch_start_called assert test_callback.on_batch_end_called + assert test_callback.on_train_batch_start_called + assert test_callback.on_train_batch_end_called assert test_callback.on_validation_batch_start_called assert test_callback.on_validation_batch_end_called assert test_callback.on_train_start_called diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index 23743dc5dcb2cc..779077c437585c 100644 --- a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -153,12 +153,12 @@ class CurrentProgressBar(ProgressBar): val_batches_seen = 0 test_batches_seen = 0 - def on_batch_start(self, trainer, pl_module): - super().on_batch_start(trainer, pl_module) + def on_train_batch_start(self, trainer, pl_module): + super().on_train_batch_start(trainer, pl_module) assert self.train_batch_idx == trainer.batch_idx - def on_batch_end(self, trainer, pl_module): - super().on_batch_end(trainer, pl_module) + def on_train_batch_end(self, trainer, pl_module): + super().on_train_batch_end(trainer, pl_module) assert self.train_batch_idx == trainer.batch_idx + 1 if not self.is_disabled and self.train_batch_idx % self.refresh_rate == 0: assert self.main_progress_bar.n == self.train_batch_idx diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index ec66afb71ca22b..305f7f3d69150e 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -50,17 +50,17 @@ def test_can_prepare_data(tmpdir): # is_overridden prepare data = True # has been called - # False + # False dm._has_prepared_data = True assert not trainer.can_prepare_data() # has not been called - # True + # True dm._has_prepared_data = False assert trainer.can_prepare_data() # is_overridden prepare data = False - # True + # True dm.prepare_data = None assert trainer.can_prepare_data() diff 
--git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 3afa1dd11c56c1..5bd81d7116948d 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -214,7 +214,7 @@ class RankZeroLoggerCheck(Callback): # this class has to be defined outside the test function, otherwise we get pickle error # due to the way ddp process is launched - def on_batch_start(self, trainer, pl_module): + def on_train_batch_start(self, trainer, pl_module): is_dummy = isinstance(trainer.logger.experiment, DummyExperiment) if trainer.is_global_zero: assert not is_dummy diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c7652ebecf3f9a..3dbb7b7c079d64 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -377,7 +377,7 @@ def increment_on_load_checkpoint(self, _): # Bind methods to keep track of epoch numbers, batch numbers it has seen # as well as number of times it has called on_load_checkpoint() model.on_epoch_end = types.MethodType(increment_epoch, model) - model.on_batch_start = types.MethodType(increment_batch, model) + model.on_train_batch_start = types.MethodType(increment_batch, model) model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model) return model @@ -691,7 +691,7 @@ class InterruptCallback(Callback): def __init__(self): super().__init__() - def on_batch_start(self, trainer, pl_module): + def on_train_batch_start(self, trainer, pl_module): raise KeyboardInterrupt class HandleInterruptCallback(Callback): diff --git a/tests/utilities/test_dtype_device_mixin.py b/tests/utilities/test_dtype_device_mixin.py index f755cf5c634ed5..08f808bda9ceb0 100644 --- a/tests/utilities/test_dtype_device_mixin.py +++ b/tests/utilities/test_dtype_device_mixin.py @@ -27,7 +27,7 @@ def __init__(self, *args, **kwargs): class DeviceAssertCallback(Callback): - def on_batch_start(self, trainer, model): + def on_train_batch_start(self, trainer, model): rank = trainer.local_rank assert isinstance(model, TopModule) # index = None also means first device From fe29c53ab5eb16758ccc448716e1c365da5c1beb Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 6 Aug 2020 02:42:09 +0200 Subject: [PATCH 13/39] add ddp sync for logging in result step (#2822) * add ddp sync for logging in result step * pep8 * pep8 * make ddp tests run also on cpu (except windowws) * create class instance in ddp test * revert automated formatting * pep8 --- pytorch_lightning/core/step_result.py | 33 +++++++++++++++++++++--- tests/core/test_results.py | 37 +++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 tests/core/test_results.py diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 253ccedabc5d79..172930fd4ad9a8 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -1,7 +1,9 @@ +import numbers from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any from torch import Tensor import torch from copy import copy +from pytorch_lightning.metrics.converters import _sync_ddp_if_available class Result(Dict): @@ -89,11 +91,18 @@ def log( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): value = value.detach() + # sync across 
ddp + if sync_ddp and isinstance(value, (torch.Tensor, numbers.Number)): + value = _sync_ddp_if_available(value, group=sync_ddp_group, reduce_op=sync_ddp_op) + if 'meta' not in self: self.__setitem__('meta', {}) @@ -338,6 +347,9 @@ def log( on_epoch: bool = False, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a key, value @@ -369,7 +381,8 @@ def log( reduce_fx: Torch.mean by default enable_graph: if True, will not auto detach the graph """ - super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) def log_dict( self, @@ -380,6 +393,9 @@ def log_dict( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a dictionary of values at once @@ -399,7 +415,8 @@ def log_dict( enable_graph: """ for k, v in dictionary.items(): - self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) class EvalResult(Result): @@ -446,6 +463,9 @@ def log( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a key, value @@ -476,7 +496,8 @@ def log( reduce_fx: Torch.mean by default enable_graph: if True, will not auto detach the graph : """ - super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) def log_dict( self, @@ -487,6 +508,9 @@ def log_dict( on_epoch: bool = True, reduce_fx: Callable = torch.mean, enable_graph: bool = False, + sync_ddp: bool = False, + sync_ddp_op: Union[Any, str] = 'mean', + sync_ddp_group: Optional[Any] = None ): """ Log a dictionary of values at once @@ -506,7 +530,8 @@ def log_dict( enable_graph: """ for k, v in dictionary.items(): - self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) + self.log(k, v, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph, + sync_ddp=sync_ddp, sync_ddp_group=sync_ddp_group, sync_ddp_op=sync_ddp_op) def get_callback_metrics(self) -> dict: result = { diff --git a/tests/core/test_results.py b/tests/core/test_results.py new file mode 100644 index 00000000000000..743a6d89153436 --- /dev/null +++ b/tests/core/test_results.py @@ -0,0 +1,37 @@ +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +import tests.base.develop_utils as tutils +import sys + + +def _setup_ddp(rank, worldsize): + import os + + os.environ["MASTER_ADDR"] = "localhost" + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=worldsize) + + +def _ddp_test_fn(rank, worldsize, result_cls: Result): + _setup_ddp(rank, worldsize) + tensor = torch.tensor([1.0]) + + res = result_cls() + res.log("test_tensor", tensor, sync_ddp=True,
sync_ddp_op=torch.distributed.ReduceOp.SUM) + + assert res["test_tensor"].item() == dist.get_world_size(), "Result-Log does not work properly with DDP and Tensors" + + +@pytest.mark.parametrize("result_cls", [Result, TrainResult, EvalResult]) +@pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows") +def test_result_reduce_ddp(result_cls): + """Make sure result logging works with DDP""" + tutils.reset_seed() + tutils.set_random_master_port() + + worldsize = 2 + mp.spawn(_ddp_test_fn, args=(worldsize, result_cls), nprocs=worldsize) From dd78be516aafe89890065ecc4c24b0303dba2712 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 5 Aug 2020 20:45:11 -0400 Subject: [PATCH 14/39] Update __init__.py --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index f3bde1c7ee5738..3a1d0fa8474580 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '0.9.0rc8' +__version__ = '0.9.0rc9' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From ac4a21507105e2c29c114e93d3cc49485ef34de2 Mon Sep 17 00:00:00 2001 From: Younghun Roh <9127047+Diuven@users.noreply.github.com> Date: Thu, 6 Aug 2020 18:40:35 +0900 Subject: [PATCH 15/39] Faster Accuracy metric (#2775) * Faster classification stats * Faster accuracy metric * minor change on cls metric * Add out-of-bound class clamping * Add more tests and minor fixes * Resolve code style warning * Update for #2781 * hotfix * Update pytorch_lightning/metrics/functional/classification.py Co-authored-by: Jirka Borovec * Update about conversation * Add docstring on stat_scores_multiple_classes Co-authored-by: Younghun Roh Co-authored-by: Jirka Borovec --- .../metrics/functional/classification.py | 81 ++++++++++++++----- .../metrics/functional/test_classification.py | 16 ++-- 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/pytorch_lightning/metrics/functional/classification.py b/pytorch_lightning/metrics/functional/classification.py index 0ed308dff87aac..d12509d5885299 100644 --- a/pytorch_lightning/metrics/functional/classification.py +++ b/pytorch_lightning/metrics/functional/classification.py @@ -138,10 +138,10 @@ def stat_scores_multiple_classes( target: torch.Tensor, num_classes: Optional[int] = None, argmax_dim: int = 1, + reduction: str = 'none', ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - Calls the stat_scores function iteratively for all classes, thus - calculating the number of true postive, false postive, true negative + Calculates the number of true positive, false positive, true negative and false negative for each class Args: pred: prediction tensor target: target tensor num_classes: number of classes if known argmax_dim: if pred is a tensor of probabilities, this indicates the axis the argmax transformation will be applied over + reduction: method for reducing result values (default: none) + Available reduction methods: + + - elementwise_mean: takes the mean + - none: pass array + - sum: add elements Return: True Positive, False Positive, True Negative, False Negative, Support @@ -173,16 +179,58 @@ if pred.ndim == target.ndim + 1: pred = to_categorical(pred, argmax_dim=argmax_dim) - num_classes = get_num_classes(pred=pred, target=target, - num_classes=num_classes) + num_classes =
get_num_classes(pred=pred, target=target, num_classes=num_classes) - tps = torch.zeros((num_classes,), device=pred.device) - fps = torch.zeros((num_classes,), device=pred.device) - tns = torch.zeros((num_classes,), device=pred.device) - fns = torch.zeros((num_classes,), device=pred.device) - sups = torch.zeros((num_classes,), device=pred.device) - for c in range(num_classes): - tps[c], fps[c], tns[c], fns[c], sups[c] = stat_scores(pred=pred, target=target, class_index=c) + if pred.dtype != torch.bool: + pred.clamp_max_(max=num_classes) + if target.dtype != torch.bool: + target.clamp_max_(max=num_classes) + + possible_reductions = ('none', 'sum', 'elementwise_mean') + if reduction not in possible_reductions: + raise ValueError("reduction type %s not supported" % reduction) + + if reduction == 'none': + pred = pred.view((-1, )).long() + target = target.view((-1, )).long() + + tps = torch.zeros((num_classes + 1,), device=pred.device) + fps = torch.zeros((num_classes + 1,), device=pred.device) + tns = torch.zeros((num_classes + 1,), device=pred.device) + fns = torch.zeros((num_classes + 1,), device=pred.device) + sups = torch.zeros((num_classes + 1,), device=pred.device) + + match_true = (pred == target).float() + match_false = 1 - match_true + + tps.scatter_add_(0, pred, match_true) + fps.scatter_add_(0, pred, match_false) + fns.scatter_add_(0, target, match_false) + tns = pred.size(0) - (tps + fps + fns) + sups.scatter_add_(0, target, torch.ones_like(match_true)) + + tps = tps[:num_classes] + fps = fps[:num_classes] + tns = tns[:num_classes] + fns = fns[:num_classes] + sups = sups[:num_classes] + + elif reduction == 'sum' or reduction == 'elementwise_mean': + count_match_true = (pred == target).sum().float() + oob_tp, oob_fp, oob_tn, oob_fn, oob_sup = stat_scores(pred, target, num_classes, argmax_dim) + + tps = count_match_true - oob_tp + fps = pred.nelement() - count_match_true - oob_fp + fns = pred.nelement() - count_match_true - oob_fn + tns = pred.nelement() * (num_classes + 1) - (tps + fps + fns + oob_tn) + sups = pred.nelement() - oob_sup.float() + + if reduction == 'elementwise_mean': + tps /= num_classes + fps /= num_classes + fns /= num_classes + tns /= num_classes + sups /= num_classes return tps, fps, tns, fns, sups @@ -218,16 +266,13 @@ def accuracy( tensor(0.7500) """ - tps, fps, tns, fns, sups = stat_scores_multiple_classes( - pred=pred, target=target, num_classes=num_classes) - if not (target > 0).any() and num_classes is None: raise RuntimeError("cannot infer num_classes when target is all zero") - if reduction in ('elementwise_mean', 'sum'): - return reduce(sum(tps) / sum(sups), reduction=reduction) - if reduction == 'none': - return reduce(tps / sups, reduction=reduction) + tps, fps, tns, fns, sups = stat_scores_multiple_classes( + pred=pred, target=target, num_classes=num_classes, reduction=reduction) + + return tps / sups def confusion_matrix( diff --git a/tests/metrics/functional/test_classification.py b/tests/metrics/functional/test_classification.py index c9e1f0892f6e7f..bc2c5cb34354af 100644 --- a/tests/metrics/functional/test_classification.py +++ b/tests/metrics/functional/test_classification.py @@ -121,15 +121,19 @@ def test_stat_scores(pred, target, expected_tp, expected_fp, expected_tn, expect assert sup.item() == expected_support -@pytest.mark.parametrize(['pred', 'target', 'expected_tp', 'expected_fp', +@pytest.mark.parametrize(['pred', 'target', 'reduction', 'expected_tp', 'expected_fp', 'expected_tn', 'expected_fn', 'expected_support'], [ - 
pytest.param(torch.tensor([0., 2., 4., 4.]), torch.tensor([0., 4., 3., 4.]), + pytest.param(torch.tensor([0., 2., 4., 4.]), torch.tensor([0., 4., 3., 4.]), 'none', + [1, 0, 0, 0, 1], [0, 0, 1, 0, 1], [3, 4, 3, 3, 1], [0, 0, 0, 1, 1], [1, 0, 0, 1, 2]), + pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), 'none', [1, 0, 0, 0, 1], [0, 0, 1, 0, 1], [3, 4, 3, 3, 1], [0, 0, 0, 1, 1], [1, 0, 0, 1, 2]), - pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), - [1, 0, 0, 0, 1], [0, 0, 1, 0, 1], [3, 4, 3, 3, 1], [0, 0, 0, 1, 1], [1, 0, 0, 1, 2]) + pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), 'sum', + torch.tensor(2), torch.tensor(2), torch.tensor(14), torch.tensor(2), torch.tensor(4)), + pytest.param(to_onehot(torch.tensor([0., 2., 4., 4.])), torch.tensor([0., 4., 3., 4.]), 'elementwise_mean', + torch.tensor(0.4), torch.tensor(0.4), torch.tensor(2.8), torch.tensor(0.4), torch.tensor(0.8)) ]) -def test_stat_scores_multiclass(pred, target, expected_tp, expected_fp, expected_tn, expected_fn, expected_support): - tp, fp, tn, fn, sup = stat_scores_multiple_classes(pred, target) +def test_stat_scores_multiclass(pred, target, reduction, expected_tp, expected_fp, expected_tn, expected_fn, expected_support): + tp, fp, tn, fn, sup = stat_scores_multiple_classes(pred, target, reduction=reduction) assert torch.allclose(torch.tensor(expected_tp).to(tp), tp) assert torch.allclose(torch.tensor(expected_fp).to(fp), fp) From 767c44950c37cb67c5864ab2b92d22da74f6608b Mon Sep 17 00:00:00 2001 From: xmotli02 <9034262+xmotli02@users.noreply.github.com> Date: Thu, 6 Aug 2020 12:08:25 +0200 Subject: [PATCH 16/39] Added basic file logger (#2721) * Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * fixup! Added basic file logger #1803 * csv * Apply suggestions from code review * tests * tests * tests * miss * docs Co-authored-by: xmotli02 Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec --- CHANGELOG.md | 2 + docs/source/loggers.rst | 6 + pytorch_lightning/core/saving.py | 2 +- pytorch_lightning/loggers/__init__.py | 3 + pytorch_lightning/loggers/csv_logs.py | 204 ++++++++++++++++++++++++++ tests/loggers/test_all.py | 7 + tests/loggers/test_csv.py | 97 ++++++++++++ 7 files changed, 320 insertions(+), 1 deletion(-) create mode 100644 pytorch_lightning/loggers/csv_logs.py create mode 100644 tests/loggers/test_csv.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e800c28964ff8..bf8d002bce0e8b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added SyncBN for DDP ([#2801](https://github.com/PyTorchLightning/pytorch-lightning/pull/2801)) +- Added basic `CSVLogger` ([#2721](https://github.com/PyTorchLightning/pytorch-lightning/pull/2721)) + - Added SSIM metrics ([#2671](https://github.com/PyTorchLightning/pytorch-lightning/pull/2671)) - Added BLEU metrics ([#2535](https://github.com/PyTorchLightning/pytorch-lightning/pull/2535)) diff --git a/docs/source/loggers.rst b/docs/source/loggers.rst index 1877e9f3eff5a8..e04ba1af5ca1cc 100644 --- a/docs/source/loggers.rst +++ b/docs/source/loggers.rst @@ -339,4 +339,10 @@ Test-tube ^^^^^^^^^ .. autoclass:: pytorch_lightning.loggers.test_tube.TestTubeLogger + :noindex: + +CSVLogger +^^^^^^^^^ + +.. 
autoclass:: pytorch_lightning.loggers.csv_logs.CSVLogger :noindex: \ No newline at end of file diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index 5e3ef1d97236d7..37c63de1804ab1 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -313,7 +313,7 @@ def load_hparams_from_yaml(config_yaml: str) -> Dict[str, Any]: return {} with open(config_yaml) as fp: - tags = yaml.load(fp, Loader=yaml.SafeLoader) + tags = yaml.load(fp) return tags diff --git a/pytorch_lightning/loggers/__init__.py b/pytorch_lightning/loggers/__init__.py index daa2b99bb80c6f..5f2f3044d0a65d 100644 --- a/pytorch_lightning/loggers/__init__.py +++ b/pytorch_lightning/loggers/__init__.py @@ -2,11 +2,14 @@ from pytorch_lightning.loggers.base import LightningLoggerBase, LoggerCollection from pytorch_lightning.loggers.tensorboard import TensorBoardLogger +from pytorch_lightning.loggers.csv_logs import CSVLogger + __all__ = [ 'LightningLoggerBase', 'LoggerCollection', 'TensorBoardLogger', + 'CSVLogger', ] try: diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py new file mode 100644 index 00000000000000..1e395abadb2937 --- /dev/null +++ b/pytorch_lightning/loggers/csv_logs.py @@ -0,0 +1,204 @@ +""" +CSV logger +---------- + +CSV logger for basic experiment logging that does not require opening ports + +""" +import io +import os +import csv +import torch +from argparse import Namespace +from typing import Optional, Dict, Any, Union + +from pytorch_lightning import _logger as log +from pytorch_lightning.core.saving import save_hparams_to_yaml +from pytorch_lightning.loggers.base import LightningLoggerBase +from pytorch_lightning.utilities.distributed import rank_zero_warn, rank_zero_only + + +class ExperimentWriter(object): + r""" + Experiment writer for CSVLogger. + + Currently supports logging hyperparameters and metrics in YAML and CSV + format, respectively. + + Args: + log_dir: Directory for the experiment logs + """ + + NAME_HPARAMS_FILE = 'hparams.yaml' + NAME_METRICS_FILE = 'metrics.csv' + + def __init__(self, log_dir: str) -> None: + self.hparams = {} + self.metrics = [] + + self.log_dir = log_dir + if os.path.exists(self.log_dir): + rank_zero_warn( + f"Experiment logs directory {self.log_dir} exists and is not empty." + " Previous log files in this directory will be deleted when the new ones are saved!"
+ ) + os.makedirs(self.log_dir, exist_ok=True) + + self.metrics_file_path = os.path.join(self.log_dir, self.NAME_METRICS_FILE) + + def log_hparams(self, params: Dict[str, Any]) -> None: + """Record hparams""" + self.hparams.update(params) + + def log_metrics(self, metrics_dict: Dict[str, float], step: Optional[int] = None) -> None: + """Record metrics""" + def _handle_value(value): + if isinstance(value, torch.Tensor): + return value.item() + return value + + if step is None: + step = len(self.metrics) + + metrics = {k: _handle_value(v) for k, v in metrics_dict.items()} + metrics['step'] = step + self.metrics.append(metrics) + + def save(self) -> None: + """Save recorded hparams and metrics into files""" + hparams_file = os.path.join(self.log_dir, self.NAME_HPARAMS_FILE) + save_hparams_to_yaml(hparams_file, self.hparams) + + if not self.metrics: + return + + last_m = {} + for m in self.metrics: + last_m.update(m) + metrics_keys = list(last_m.keys()) + + with io.open(self.metrics_file_path, 'w', newline='') as f: + self.writer = csv.DictWriter(f, fieldnames=metrics_keys) + self.writer.writeheader() + self.writer.writerows(self.metrics) + + +class CSVLogger(LightningLoggerBase): + r""" + Log to local file system in yaml and CSV format. Logs are saved to + ``os.path.join(save_dir, name, version)``. + + Example: + >>> from pytorch_lightning import Trainer + >>> from pytorch_lightning.loggers import CSVLogger + >>> logger = CSVLogger("logs", name="my_exp_name") + >>> trainer = Trainer(logger=logger) + + Args: + save_dir: Save directory + name: Experiment name. Defaults to ``'default'``. + version: Experiment version. If version is not specified the logger inspects the save + directory for existing versions, then automatically assigns the next available version. + """ + + def __init__(self, + save_dir: str, + name: Optional[str] = "default", + version: Optional[Union[int, str]] = None): + + super().__init__() + self._save_dir = save_dir + self._name = name or '' + self._version = version + self._experiment = None + + @property + def root_dir(self) -> str: + """ + Parent directory for all checkpoint subdirectories. + If the experiment name parameter is ``None`` or the empty string, no experiment subdirectory is used + and the checkpoint will be saved in "save_dir/version_dir" + """ + if not self.name: + return self.save_dir + return os.path.join(self.save_dir, self.name) + + @property + def log_dir(self) -> str: + """ + The log directory for this run. By default, it is named + ``'version_${self.version}'`` but it can be overridden by passing a string value + for the constructor's version parameter instead of ``None`` or an int. + """ + # create a pseudo standard path ala test-tube + version = self.version if isinstance(self.version, str) else f"version_{self.version}" + log_dir = os.path.join(self.root_dir, version) + return log_dir + + @property + def save_dir(self) -> Optional[str]: + return self._save_dir + + @property + def experiment(self) -> ExperimentWriter: + r""" + + Actual ExperimentWriter object. To use ExperimentWriter features in your + :class:`~pytorch_lightning.core.lightning.LightningModule` do the following. 
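+    The writer exposes ``log_hparams``, ``log_metrics`` and ``save``.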
+ + Example:: + + self.logger.experiment.some_experiment_writer_function() + + """ + if self._experiment: + return self._experiment + + os.makedirs(self.root_dir, exist_ok=True) + self._experiment = ExperimentWriter(log_dir=self.log_dir) + return self._experiment + + @rank_zero_only + def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: + params = self._convert_params(params) + self.experiment.log_hparams(params) + + @rank_zero_only + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + self.experiment.log_metrics(metrics, step) + + @rank_zero_only + def save(self) -> None: + super().save() + self.experiment.save() + + @rank_zero_only + def finalize(self, status: str) -> None: + self.save() + + @property + def name(self) -> str: + return self._name + + @property + def version(self) -> int: + if self._version is None: + self._version = self._get_next_version() + return self._version + + def _get_next_version(self): + root_dir = os.path.join(self._save_dir, self.name) + + if not os.path.isdir(root_dir): + log.warning('Missing logger folder: %s', root_dir) + return 0 + + existing_versions = [] + for d in os.listdir(root_dir): + if os.path.isdir(os.path.join(root_dir, d)) and d.startswith("version_"): + existing_versions.append(int(d.split("_")[1])) + + if len(existing_versions) == 0: + return 0 + + return max(existing_versions) + 1 diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py index 5bd81d7116948d..7978aa8e41acec 100644 --- a/tests/loggers/test_all.py +++ b/tests/loggers/test_all.py @@ -5,11 +5,13 @@ import platform from unittest import mock +import cloudpickle import pytest import tests.base.develop_utils as tutils from pytorch_lightning import Trainer, Callback from pytorch_lightning.loggers import ( + CSVLogger, TensorBoardLogger, MLFlowLogger, NeptuneLogger, @@ -34,6 +36,7 @@ def _get_logger_args(logger_class, save_dir): @pytest.mark.parametrize("logger_class", [ TensorBoardLogger, + CSVLogger, CometLogger, MLFlowLogger, NeptuneLogger, @@ -85,6 +88,7 @@ def log_metrics(self, metrics, step): @pytest.mark.parametrize("logger_class", [ + CSVLogger, TensorBoardLogger, CometLogger, MLFlowLogger, @@ -148,6 +152,7 @@ def name(self): @pytest.mark.parametrize("logger_class", [ TensorBoardLogger, + CSVLogger, CometLogger, MLFlowLogger, NeptuneLogger, @@ -170,6 +175,7 @@ def test_loggers_pickle(tmpdir, monkeypatch, logger_class): # test pickling loggers pickle.dumps(logger) + cloudpickle.dumps(logger) trainer = Trainer( max_epochs=1, @@ -226,6 +232,7 @@ def on_train_batch_start(self, trainer, pl_module): @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows") @pytest.mark.parametrize("logger_class", [ TensorBoardLogger, + # CSVLogger, # todo CometLogger, MLFlowLogger, NeptuneLogger, diff --git a/tests/loggers/test_csv.py b/tests/loggers/test_csv.py new file mode 100644 index 00000000000000..3bc8330075e6a1 --- /dev/null +++ b/tests/loggers/test_csv.py @@ -0,0 +1,97 @@ +from argparse import Namespace + +import pytest +import torch +import os + +from pytorch_lightning.core.saving import load_hparams_from_yaml +from pytorch_lightning.loggers import CSVLogger +from pytorch_lightning.loggers.csv_logs import ExperimentWriter + + +def test_file_logger_automatic_versioning(tmpdir): + """Verify that automatic versioning works""" + + root_dir = tmpdir.mkdir("exp") + root_dir.mkdir("version_0") + root_dir.mkdir("version_1") + + logger = CSVLogger(save_dir=tmpdir, name="exp") + + 
assert logger.version == 2 + + +def test_file_logger_manual_versioning(tmpdir): + """Verify that manual versioning works""" + + root_dir = tmpdir.mkdir("exp") + root_dir.mkdir("version_0") + root_dir.mkdir("version_1") + root_dir.mkdir("version_2") + + logger = CSVLogger(save_dir=tmpdir, name="exp", version=1) + + assert logger.version == 1 + + +def test_file_logger_named_version(tmpdir): + """Verify that manual versioning works for string versions, e.g. '2020-02-05-162402' """ + + exp_name = "exp" + tmpdir.mkdir(exp_name) + expected_version = "2020-02-05-162402" + + logger = CSVLogger(save_dir=tmpdir, name=exp_name, version=expected_version) + logger.log_hyperparams({"a": 1, "b": 2}) + logger.save() + assert logger.version == expected_version + assert os.listdir(tmpdir / exp_name) == [expected_version] + assert os.listdir(tmpdir / exp_name / expected_version) + + +@pytest.mark.parametrize("name", ['', None]) +def test_file_logger_no_name(tmpdir, name): + """Verify that None or empty name works""" + logger = CSVLogger(save_dir=tmpdir, name=name) + logger.save() + assert logger.root_dir == tmpdir + assert os.listdir(tmpdir / 'version_0') + + +@pytest.mark.parametrize("step_idx", [10, None]) +def test_file_logger_log_metrics(tmpdir, step_idx): + logger = CSVLogger(tmpdir) + metrics = { + "float": 0.3, + "int": 1, + "FloatTensor": torch.tensor(0.1), + "IntTensor": torch.tensor(1) + } + logger.log_metrics(metrics, step_idx) + logger.save() + + path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE) + with open(path_csv, 'r') as fp: + lines = fp.readlines() + assert len(lines) == 2 + assert all([n in lines[0] for n in metrics]) + + +def test_file_logger_log_hyperparams(tmpdir): + logger = CSVLogger(tmpdir) + hparams = { + "float": 0.3, + "int": 1, + "string": "abc", + "bool": True, + "dict": {'a': {'b': 'c'}}, + "list": [1, 2, 3], + "namespace": Namespace(foo=Namespace(bar='buzz')), + "layer": torch.nn.BatchNorm1d + } + logger.log_hyperparams(hparams) + logger.save() + + path_yaml = os.path.join(logger.log_dir, ExperimentWriter.NAME_HPARAMS_FILE) + params = load_hparams_from_yaml(path_yaml) + assert all([n in params for n in hparams]) From 9b997c8616d5e94446fe48eee20daa3b9f93b424 Mon Sep 17 00:00:00 2001 From: s-rog <55400948+s-rog@users.noreply.github.com> Date: Thu, 6 Aug 2020 19:11:43 +0800 Subject: [PATCH 17/39] add test for none checkpoint in ddp_spawn (#2845) * add test for none checkpoint in ddp_spawn * fix code style * make sure checkpoint_callback is none * Fix tests Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- tests/trainer/test_trainer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 3dbb7b7c079d64..d6641c2f7ab249 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -988,3 +988,17 @@ def setup(self, stage): trainer.test(ckpt_path=None) assert trainer.stage == 'test' assert trainer.get_model().stage == 'test' + + +def test_trainer_ddp_spawn_none_checkpoint(tmpdir): + model = EvalModelTemplate() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + checkpoint_callback=None, + distributed_backend="ddp_spawn" + ) + assert trainer.checkpoint_callback is None + result = trainer.fit(model) + assert trainer.checkpoint_callback is None + assert result == 1 From 9ab071588bbe0e24441ae0f07a271bded0e4d9c3 Mon Sep 17 00:00:00 2001 From: Nathan Raw Date: Thu, 6 Aug 2020 05:12:47 -0600 Subject: [PATCH 18/39] Setup 
extras (#2831) * :art: use package extras * :art: get extras from reqs * :art: . * :pencil: docs * :art: . --- .github/CONTRIBUTING.md | 5 ++--- setup.py | 26 +++++++++++++++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index d39b0bd112c77a..9e1faafc63ee3f 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -137,7 +137,7 @@ formatting errors. In certain cases, a missing blank line or a wrong indent can Run these commands ```bash -pip install -r requirements/docs.txt +pip install ".[docs]" cd docs make html ``` @@ -159,8 +159,7 @@ Testing your work locally will help you speed up the process since it allows you To setup a local development environment, install both local and test dependencies: ```bash -python -m pip install -r requirements/devel.txt -python -m pip install -r requirements/examples.txt +python -m pip install ".[dev, examples]" python -m pip install pre-commit ``` diff --git a/setup.py b/setup.py index 0246b3cd5d3d9f..ded68282faa43b 100755 --- a/setup.py +++ b/setup.py @@ -12,21 +12,23 @@ # https://packaging.python.org/guides/single-sourcing-package-version/ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ - PATH_ROOT = os.path.dirname(__file__) builtins.__LIGHTNING_SETUP__ = True import pytorch_lightning # noqa: E402 -def load_requirements(path_dir=PATH_ROOT, comment_char='#'): - with open(os.path.join(path_dir, 'requirements', 'base.txt'), 'r') as file: +def load_requirements(path_dir=PATH_ROOT, file_name='base.txt', comment_char='#'): + with open(os.path.join(path_dir, 'requirements', file_name), 'r') as file: lines = [ln.strip() for ln in file.readlines()] reqs = [] for ln in lines: # filer all comments if comment_char in ln: - ln = ln[:ln.index(comment_char)] + ln = ln[:ln.index(comment_char)].strip() + # Make slight syntax alteration to git dependency for PL's sphinx theme + if ln.startswith('git') and file_name == 'docs.txt': + ln = f'pt_lightning_sphinx_theme @ {ln}#egg=pt-lightning-sphinx-theme' if ln: # if requirement is not empty reqs.append(ln) return reqs @@ -43,6 +45,19 @@ def load_long_description(): return text +# https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras +# Define package extras. These are only installed if you specify them. +# From remote, use like `pip install pytorch-lightning[dev, docs]` +# From local copy of repo, use like `pip install ".[dev, docs]"` +extras = { + 'docs': load_requirements(file_name='docs.txt'), + 'examples': load_requirements(file_name='examples.txt'), + 'extra': load_requirements(file_name='extra.txt'), + 'test': load_requirements(file_name='test.txt') +} +extras['dev'] = extras['extra'] + extras['test'] +extras['all'] = extras['dev'] + extras['examples'] + extras['docs'] + # https://packaging.python.org/discussions/install-requires-vs-requirements / # keep the meta-data here for simplicity in reading this file... it's not obvious # what happens and to non-engineers they won't know to look in init ... 
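For context on the hunk above: `extras['dev']` resolves to the union of `extra` and `test`, and `extras['all']` adds `examples` and `docs` on top, so `pip install "pytorch-lightning[dev]"` installs exactly the requirement files named in the mapping. A minimal, self-contained sketch of the comment-stripping that `load_requirements` performs, using invented requirement lines in place of a real `requirements/*.txt` file:

```python
# Stand-alone sketch of load_requirements()'s parsing; the sample
# requirement lines below are hypothetical, not from the repository.
lines = [
    'torch>=1.3  # core dependency',
    '# a full-line comment',
    'future>=0.17.1',
    '',
]

reqs = []
for ln in lines:
    if '#' in ln:
        # cut the line at the comment character and trim whitespace
        ln = ln[:ln.index('#')].strip()
    if ln:  # keep only lines that still carry a requirement
        reqs.append(ln)

assert reqs == ['torch>=1.3', 'future>=0.17.1']
```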
@@ -67,7 +82,8 @@ def load_long_description(): keywords=['deep learning', 'pytorch', 'AI'], python_requires='>=3.6', setup_requires=[], - install_requires=load_requirements(PATH_ROOT), + install_requires=load_requirements(), + extras_require=extras, project_urls={ "Bug Tracker": "https://github.com/PyTorchLightning/pytorch-lightning/issues", From ed3ee982b33db395a1fb41865d8ca3be39379b7e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 6 Aug 2020 16:58:51 +0200 Subject: [PATCH 19/39] clean tests imports (#2834) --- tests/base/datasets.py | 2 +- tests/base/deterministic_model.py | 3 +-- tests/base/develop_utils.py | 2 +- tests/base/model_train_steps.py | 3 ++- tests/callbacks/test_early_stopping.py | 2 +- tests/core/test_datamodules.py | 2 +- tests/loggers/test_tensorboard.py | 1 - tests/metrics/functional/test_regression.py | 2 +- tests/metrics/test_converters.py | 3 ++- tests/metrics/test_regression.py | 1 - tests/models/test_amp.py | 4 ++-- tests/models/test_gpu.py | 3 ++- tests/models/test_grad_norm.py | 3 ++- tests/models/test_horovod.py | 2 -- tests/models/{test_onnx_save.py => test_onnx.py} | 3 ++- tests/trainer/test_config_validator.py | 2 +- tests/trainer/test_trainer_steps_result_return.py | 8 +++++--- tests/trainer/test_trainer_steps_scalar_return.py | 3 ++- tests/trainer/test_validation_steps_result_return.py | 7 ++++--- 19 files changed, 30 insertions(+), 26 deletions(-) rename tests/models/{test_onnx_save.py => test_onnx.py} (99%) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 27e614eee68cfc..5bc6048a43c65f 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -1,8 +1,8 @@ import logging import os import random -import urllib.request import time +import urllib.request from typing import Tuple, Optional, Sequence import torch diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 75f5cad3b39cf9..676720e789e89a 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -1,9 +1,8 @@ -import numpy as np import torch from torch import nn from torch.utils.data import Dataset, DataLoader -from pytorch_lightning import TrainResult, EvalResult +from pytorch_lightning import TrainResult, EvalResult from pytorch_lightning.core.lightning import LightningModule diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index ada745951494cc..37fde1d8723c08 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -1,3 +1,4 @@ +import functools import os import numpy as np @@ -8,7 +9,6 @@ from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger from tests import TEMP_PATH, RANDOM_PORTS, RANDOM_SEEDS from tests.base.model_template import EvalModelTemplate -import functools def assert_speed_parity_relative(pl_times, pt_times, max_diff: float = 0.1): diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 189e496564da1d..c361c3692ad17c 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -1,10 +1,11 @@ import math from abc import ABC from collections import OrderedDict -from pytorch_lightning import TrainResult, EvalResult import torch +from pytorch_lightning import TrainResult, EvalResult + class TrainingStepVariations(ABC): """ diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py index 17ca3bb2210f33..eb5a400655b20b 100644 --- a/tests/callbacks/test_early_stopping.py +++ b/tests/callbacks/test_early_stopping.py @@ -2,8 +2,8 @@ import 
cloudpickle import pytest - import torch + from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from tests.base import EvalModelTemplate diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index 305f7f3d69150e..fd4d3c082e0be9 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -1,8 +1,8 @@ import pickle from argparse import ArgumentParser -import torch import pytest +import torch from pytorch_lightning import Trainer from tests.base import EvalModelTemplate diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index 21c58084e2df40..e5aec716a25071 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -5,7 +5,6 @@ import torch import yaml from packaging import version -from tensorboard.backend.event_processing.event_accumulator import EventAccumulator from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger diff --git a/tests/metrics/functional/test_regression.py b/tests/metrics/functional/test_regression.py index cd251c77a98fc1..628e37ec78e955 100644 --- a/tests/metrics/functional/test_regression.py +++ b/tests/metrics/functional/test_regression.py @@ -1,6 +1,6 @@ +import numpy as np import pytest import torch -import numpy as np from skimage.metrics import peak_signal_noise_ratio as ski_psnr from skimage.metrics import structural_similarity as ski_ssim diff --git a/tests/metrics/test_converters.py b/tests/metrics/test_converters.py index 60eb8cc48ec7a0..1d1412dd8171a0 100644 --- a/tests/metrics/test_converters.py +++ b/tests/metrics/test_converters.py @@ -1,6 +1,7 @@ +import sys + import numpy as np import pytest -import sys import torch import torch.distributed as dist import torch.multiprocessing as mp diff --git a/tests/metrics/test_regression.py b/tests/metrics/test_regression.py index e5ecd51c775b90..36c408e93c4695 100644 --- a/tests/metrics/test_regression.py +++ b/tests/metrics/test_regression.py @@ -3,7 +3,6 @@ # Especially reduction and reducing across processes won't be tested here! 
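 # NOTE: the apparently unused skimage reference import is dropped just
 # below; the surviving imports already follow the usual stdlib /
 # third-party / first-party grouping this commit enforces elsewhere.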
import torch -from skimage.metrics import peak_signal_noise_ratio as ski_psnr from pytorch_lightning.metrics.regression import ( MAE, MSE, RMSE, RMSLE, PSNR, SSIM diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py index 5f734fb3bdcf60..7f729c0c1fa5d4 100644 --- a/tests/models/test_amp.py +++ b/tests/models/test_amp.py @@ -1,15 +1,15 @@ import os +from unittest.mock import MagicMock import pytest import torch +import wandb import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -import wandb -from unittest.mock import MagicMock @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 6752e559632cba..7497a53083612e 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -2,6 +2,7 @@ import pytest import torch +from torchtext.data import Batch, Dataset, Example, Field, LabelField import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils @@ -10,7 +11,7 @@ from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -from torchtext.data import Batch, Dataset, Example, Field, LabelField + PRETEND_N_OF_GPUS = 16 diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index dc7eee557d4484..2e0d4454500f39 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -1,6 +1,7 @@ +import os + import numpy as np import pytest -import os from pytorch_lightning import Trainer from tests.base import EvalModelTemplate diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 05aa2d7f29bc3b..f48db196c104aa 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -4,10 +4,8 @@ import shlex import subprocess import sys - from unittest.mock import patch -import numpy as np import pytest import torch diff --git a/tests/models/test_onnx_save.py b/tests/models/test_onnx.py similarity index 99% rename from tests/models/test_onnx_save.py rename to tests/models/test_onnx.py index f824f33c93bc14..d7cc7cffaec3f7 100644 --- a/tests/models/test_onnx_save.py +++ b/tests/models/test_onnx.py @@ -1,9 +1,10 @@ import os +import numpy as np import onnxruntime import pytest import torch -import numpy as np + import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer diff --git a/tests/trainer/test_config_validator.py b/tests/trainer/test_config_validator.py index 4b31c7d09dba35..8db122037b51c9 100755 --- a/tests/trainer/test_config_validator.py +++ b/tests/trainer/test_config_validator.py @@ -1,7 +1,7 @@ import pytest import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, LightningModule +from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index af1f582bdf7097..1785fea3c0afee 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -2,12 +2,14 @@ Tests to ensure that the 
training loop works with a dict """ import os + +import pytest import torch + from pytorch_lightning import Trainer -from tests.base.deterministic_model import DeterministicModel -from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +from pytorch_lightning.core.step_result import TrainResult from tests.base import EvalModelTemplate -import pytest +from tests.base.deterministic_model import DeterministicModel # test with train_step_end diff --git a/tests/trainer/test_trainer_steps_scalar_return.py b/tests/trainer/test_trainer_steps_scalar_return.py index b893b58310dc37..e5eb1e9bccf5f1 100644 --- a/tests/trainer/test_trainer_steps_scalar_return.py +++ b/tests/trainer/test_trainer_steps_scalar_return.py @@ -1,9 +1,10 @@ """ Tests to ensure that the training loop works with a scalar """ +import torch + from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel -import torch def test_training_step_scalar(tmpdir): diff --git a/tests/trainer/test_validation_steps_result_return.py b/tests/trainer/test_validation_steps_result_return.py index 118e420adbd8ca..8162f57287e71e 100644 --- a/tests/trainer/test_validation_steps_result_return.py +++ b/tests/trainer/test_validation_steps_result_return.py @@ -2,12 +2,13 @@ Tests to ensure that the training loop works with a dict """ import os + +import pytest import torch + from pytorch_lightning import Trainer -from tests.base.deterministic_model import DeterministicModel -from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult from tests.base import EvalModelTemplate -import pytest +from tests.base.deterministic_model import DeterministicModel # test with train_step_end From a829f15f8c9e2a3fdcdea3c59f54ba2879842622 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 1 Aug 2020 16:32:41 +0530 Subject: [PATCH 20/39] Support limit_mode_batches(int) for infinite dataloader --- pytorch_lightning/trainer/data_loading.py | 39 +++++++++++------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 09186765c6eeec..143a132f12f98a 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -212,18 +212,20 @@ def reset_train_dataloader(self, model: LightningModule) -> None: # automatically add samplers self.train_dataloader = self.auto_add_sampler(self.train_dataloader, train=True) + self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if not _has_len(self.train_dataloader): - self.num_training_batches = float('inf') + if isinstance(self.limit_train_batches, int): + self.num_training_batches = min(self.num_training_batches, self.limit_train_batches) else: - # try getting the length - if isinstance(self.limit_train_batches, float): - self.num_training_batches = len(self.train_dataloader) + if self.num_training_batches != float('inf'): self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - else: - self.num_training_batches = min(len(self.train_dataloader), self.limit_train_batches) + elif self.limit_train_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an infinite DataLoader (e.g. 
with an IterableDataset' + f' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' + f' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') # determine when to check validation # if int passed in, val checks that often @@ -308,20 +310,17 @@ def _reset_eval_dataloader( # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') - if num_batches != float('inf'): - self._check_batch_limits(f'limit_{mode}_batches') - - # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, float): + # limit num batches either as a percent or num steps + if isinstance(limit_eval_batches, int): + num_batches = min(num_batches, limit_eval_batches) + else: + if num_batches != float('inf'): num_batches = int(num_batches * limit_eval_batches) - else: - num_batches = min(len(dataloader), limit_eval_batches) - - elif limit_eval_batches not in (0.0, 1.0): - raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0` or `1.0`.') + elif limit_eval_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an infinite DataLoader (e.g. with an IterableDataset' + f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or `int`') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) From cbddd35104d561bee7617dfc391ef0e92a7b5ebc Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Sat, 1 Aug 2020 18:10:01 +0530 Subject: [PATCH 21/39] flake8 --- pytorch_lightning/trainer/data_loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 143a132f12f98a..011ac1e4452ad7 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -224,8 +224,8 @@ def reset_train_dataloader(self, model: LightningModule) -> None: elif self.limit_train_batches not in (0.0, 1.0): raise MisconfigurationException( 'When using an infinite DataLoader (e.g. 
with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' - f' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') + ' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') # determine when to check validation # if int passed in, val checks that often From 486dbc6cd2f049f6fdf59cb0cf73d5bd27ba30a9 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 3 Aug 2020 02:05:10 +0530 Subject: [PATCH 22/39] revert and update --- pytorch_lightning/trainer/data_loading.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 011ac1e4452ad7..237a421fc83319 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -216,16 +216,16 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self._worker_check(self.train_dataloader, 'train dataloader') self._check_batch_limits('limit_train_batches') - if isinstance(self.limit_train_batches, int): - self.num_training_batches = min(self.num_training_batches, self.limit_train_batches) + if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: + self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) else: if self.num_training_batches != float('inf'): self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) elif self.limit_train_batches not in (0.0, 1.0): raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - ' or when DataLoader does not implement `__len__`) for `limit_train_batches`,' - ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or `int`') + 'When using an IterableDataset for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' + ' num_training_batches to use.') # determine when to check validation # if int passed in, val checks that often @@ -243,8 +243,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.val_check_batch = float('inf') else: raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - ' or when DataLoader does not implement `__len__`) for `train_dataloader`,' + 'When using an IterableDataset for `train_dataloader`,' ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: @@ -311,16 +310,16 @@ def _reset_eval_dataloader( limit_eval_batches = getattr(self, f'limit_{mode}_batches') # limit num batches either as a percent or num steps - if isinstance(limit_eval_batches, int): - num_batches = min(num_batches, limit_eval_batches) + if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: + num_batches = min(num_batches, int(limit_eval_batches)) else: if num_batches != float('inf'): num_batches = int(num_batches * limit_eval_batches) elif limit_eval_batches not in (0.0, 1.0): raise MisconfigurationException( - 'When using an infinite DataLoader (e.g. with an IterableDataset' - f' or when DataLoader does not implement `__len__`) for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or `int`') + 'When using an IterableDataset for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + f' num_{mode}_batches to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) From 7d99c320155dcabfb9dd7b9b9a97d1958edd4ff9 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 3 Aug 2020 02:10:36 +0530 Subject: [PATCH 23/39] add and update tests --- tests/trainer/test_dataloaders.py | 75 +++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1c7e21b7a72bb5..c05338ab724d19 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -256,6 +256,65 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): f'Multiple `test_dataloaders` not initiated properly, got {trainer.test_dataloaders}' +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [ + pytest.param(0.0, 0.0, 0.0), + pytest.param(1.0, 1.0, 1.0), + ] +) +def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results == 1 + assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') + assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') + + +@pytest.mark.parametrize( + ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], + [pytest.param(10, 10, 10)] +) +def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): + """Verify inf train, val & test dataloaders (e.g. 
IterableDataset) passed with batch limit as number""" + model = EvalModelTemplate() + model.train_dataloader = model.train_dataloader__infinite + model.val_dataloader = model.val_dataloader__infinite + model.test_dataloader = model.test_dataloader__infinite + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=limit_train_batches, + limit_val_batches=limit_val_batches, + limit_test_batches=limit_test_batches, + ) + + results = trainer.fit(model) + assert results + assert trainer.num_training_batches == limit_train_batches + assert trainer.num_val_batches[0] == limit_val_batches + + trainer.test(ckpt_path=None) + assert trainer.num_test_batches[0] == limit_test_batches + + @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], [ @@ -266,7 +325,7 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): ] ) def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit in percent""" + """Verify num_batches for train, val & test dataloaders passed with batch limit in percent""" model = EvalModelTemplate() model.val_dataloader = model.val_dataloader__multiple_mixed_length model.test_dataloader = model.test_dataloader__multiple_mixed_length @@ -307,7 +366,7 @@ def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, lim ] ) def test_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): - """Verify num_batches for val & test dataloaders passed with batch limit as number""" + """Verify num_batches for train, val & test dataloaders passed with batch limit as number""" os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() @@ -436,7 +495,7 @@ def test_train_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -447,7 +506,7 @@ def test_val_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -458,7 +517,7 @@ def test_test_inf_dataloader_error(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) @@ -774,7 +833,7 @@ def test_train_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, val_check_interval=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -785,7 +844,7 @@ def test_val_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_val_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.fit(model) @@ -796,5 +855,5 @@ def 
test_test_dataloader_not_implemented_error_failed(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_steps=5, max_epochs=1, limit_test_batches=0.5) - with pytest.raises(MisconfigurationException, match='infinite DataLoader'): + with pytest.raises(MisconfigurationException, match='using an IterableDataset'): trainer.test(model) From 515d9be084caf28811a556cc12f4a8aa6e700577 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 3 Aug 2020 02:19:47 +0530 Subject: [PATCH 24/39] pep8 --- tests/trainer/test_dataloaders.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index c05338ab724d19..2df86c2ab52438 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -263,7 +263,8 @@ def test_multiple_dataloaders_passed_to_fit(tmpdir, ckpt_path): pytest.param(1.0, 1.0, 1.0), ] ) -def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): +def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, + limit_val_batches, limit_test_batches): """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit in percent""" model = EvalModelTemplate() model.train_dataloader = model.train_dataloader__infinite From a5972f525fd7bcad15fde45560775525ca1f8369 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Tue, 4 Aug 2020 23:15:31 +0530 Subject: [PATCH 25/39] chlog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf8d002bce0e8b..d1e3e449ae6d33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader(IterableDataset) ([2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) + ### Changed - Truncated long version numbers in progress bar ([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594)) From b3efefb863fd7cabb05ceb3a48d552d6ad7bb091 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 09:37:19 +0200 Subject: [PATCH 26/39] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d1e3e449ae6d33..d0c2e39a428f34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) -- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader(IterableDataset) ([2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) ### Changed From e57d26e4c090a21d8d41998eed870a86f3c9ef79 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 5 Aug 2020 16:15:26 +0530 Subject: [PATCH 27/39] Add suggestions by @awaelchli --- pytorch_lightning/trainer/data_loading.py | 30 +++++++++++------------ tests/trainer/test_dataloaders.py | 5 +++- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 237a421fc83319..18f095446dc655 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -218,14 +218,13 @@ def reset_train_dataloader(self, model: LightningModule) -> None: if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) - else: - if self.num_training_batches != float('inf'): - self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - elif self.limit_train_batches not in (0.0, 1.0): - raise MisconfigurationException( - 'When using an IterableDataset for `limit_train_batches`,' - ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - ' num_training_batches to use.') + elif self.num_training_batches != float('inf'): + self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) + elif self.limit_train_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an IterableDataset for `limit_train_batches`,' + ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' + ' num_training_batches to use.') # determine when to check validation # if int passed in, val checks that often @@ -312,14 +311,13 @@ def _reset_eval_dataloader( # limit num batches either as a percent or num steps if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: num_batches = min(num_batches, int(limit_eval_batches)) - else: - if num_batches != float('inf'): - num_batches = int(num_batches * limit_eval_batches) - elif limit_eval_batches not in (0.0, 1.0): - raise MisconfigurationException( - 'When using an IterableDataset for `limit_{mode}_batches`,' - f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - f' num_{mode}_batches to use.') + elif num_batches != float('inf'): + num_batches = int(num_batches * limit_eval_batches) + elif limit_eval_batches not in (0.0, 1.0): + raise MisconfigurationException( + 'When using an IterableDataset for `limit_{mode}_batches`,' + f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' + f' num_{mode}_batches to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 2df86c2ab52438..1aad5047855a2e 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -290,7 +290,10 @@ def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, @pytest.mark.parametrize( ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'], - [pytest.param(10, 10, 10)] + [ + pytest.param(0, 0, 0), + pytest.param(10, 10, 10), + ] ) def test_inf_dataloaders_with_limit_num_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches): """Verify inf train, val & test dataloaders (e.g. IterableDataset) passed with batch limit as number""" From a3c1f17fb02fcee36ddcb1dbbf3008719a6cadd9 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 5 Aug 2020 16:35:33 +0530 Subject: [PATCH 28/39] docs --- docs/source/sequences.rst | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index e24ee5bbca1cc9..301e4efbba6df8 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -49,8 +49,8 @@ Lightning can handle TBTT automatically via this flag. .. note:: If you need to modify how the batch is split, override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`. -.. note:: Using this feature requires updating your LightningModule's :meth:`pytorch_lightning.core.LightningModule.training_step` to include - a `hiddens` arg. +.. note:: Using this feature requires updating your LightningModule's + :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg. ---------- @@ -59,10 +59,13 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or to an int - (specifying the number of training batches to run before validation) when initializing the Trainer. - This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate - the validation interval when val_check_interval is less than one. +.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or an int + (specifying the number of training batches to run before validation) when initializing the Trainer. This is + because the IterableDataset does not have a __len__ and Lightning requires this to calculate the validation + interval when val_check_interval is less than one. Similarly, you can set limit_{mode}_batches to a float or + an int. If it is set to 0.0 or 0 it will set num_{mode}_batches to 0, if it is an int it will set num_{mode}_batches + to limit_{mode}_batches, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. + Here mode can be train/val/test. .. testcode:: @@ -87,3 +90,9 @@ option when using sequential data. 
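To make the new behaviour concrete, here is a minimal sketch of the infinite-loader case the note above describes; the `RandomStream` class is invented for illustration and is not part of the patch:

```python
import torch
from torch.utils.data import DataLoader, IterableDataset

from pytorch_lightning import Trainer


class RandomStream(IterableDataset):
    """An endless stream of samples: it defines no __len__,
    so Lightning cannot infer an epoch length from it."""

    def __iter__(self):
        while True:
            yield torch.randn(32)


train_loader = DataLoader(RandomStream(), batch_size=None)

# An int caps how many batches make up one training "epoch".
trainer = Trainer(limit_train_batches=100)

# limit_train_batches=0.0 (or 0) runs no training batches at all, and the
# default 1.0 iterates the stream without limit; any other float raises a
# MisconfigurationException, since there is no length to take a fraction
# of. limit_val_batches and limit_test_batches follow the same rules, and
# val_check_interval likewise accepts only 1.0 or an int here.
```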
# Set val_check_interval trainer = Trainer(val_check_interval=100) + + # Set limit_val_batches to 0.0 or 0 + trainer = Trainer(limit_val_batches=0.0) + + # Set limit_val_batches as an int + trainer = Trainer(limit_val_batches=100) From 71c4679e3fb15446a9ae446591ec6ca63af85cfb Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 15:15:41 +0200 Subject: [PATCH 29/39] Apply suggestions from code review Co-authored-by: Ethan Harris --- pytorch_lightning/trainer/data_loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 18f095446dc655..ce6a7235b2bd4c 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -220,7 +220,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) elif self.num_training_batches != float('inf'): self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) - elif self.limit_train_batches not in (0.0, 1.0): + elif self.limit_train_batches != 1.0: raise MisconfigurationException( 'When using an IterableDataset for `limit_train_batches`,' ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. An int k specifies' @@ -313,7 +313,7 @@ def _reset_eval_dataloader( num_batches = min(num_batches, int(limit_eval_batches)) elif num_batches != float('inf'): num_batches = int(num_batches * limit_eval_batches) - elif limit_eval_batches not in (0.0, 1.0): + elif limit_eval_batches != 1.0: raise MisconfigurationException( 'When using an IterableDataset for `limit_{mode}_batches`,' f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' From c7472b8a2fa568cd3f23e051bbc941e2ccb16b43 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 15:25:49 +0200 Subject: [PATCH 30/39] Apply suggestions from code review --- docs/source/sequences.rst | 8 ++++---- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/trainer/data_loading.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index 301e4efbba6df8..bb438154e39091 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -61,10 +61,10 @@ option when using sequential data. .. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or an int (specifying the number of training batches to run before validation) when initializing the Trainer. This is - because the IterableDataset does not have a __len__ and Lightning requires this to calculate the validation - interval when val_check_interval is less than one. Similarly, you can set limit_{mode}_batches to a float or - an int. If it is set to 0.0 or 0 it will set num_{mode}_batches to 0, if it is an int it will set num_{mode}_batches - to limit_{mode}_batches, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. + because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation + interval when ``val_check_interval`` is less than one. Similarly, you can set limit_{mode}_batches to a float or + an int. 
If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` + to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. Here mode can be train/val/test. .. testcode:: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index f816726ddf1e17..4189c828ed266e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1771,7 +1771,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg elif self.example_input_array is not None: input_data = self.example_input_array else: - raise ValueError('input_sample and example_input_array tensors are both missing.') + raise ValueError('`input_sample` and `example_input_array` tensors are both missing.') if 'example_outputs' not in kwargs: self.eval() diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index ce6a7235b2bd4c..53d624d46c0d9d 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -317,7 +317,7 @@ def _reset_eval_dataloader( raise MisconfigurationException( 'When using an IterableDataset for `limit_{mode}_batches`,' f' `Trainer(limit_{mode}_batches)` must be `0.0`, `1.0` or an int. An int k specifies' - f' num_{mode}_batches to use.') + f' `num_{mode}_batches` to use.') if num_batches == 0 and limit_eval_batches > 0.0 and isinstance(limit_eval_batches, float): min_pct = 1.0 / len(dataloader) From 9a13fff79ad74f846b9b0439992eb5df31622dbc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 17:04:14 +0200 Subject: [PATCH 31/39] fix --- tests/models/test_onnx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index d7cc7cffaec3f7..278465941a043a 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -85,7 +85,7 @@ def test_error_if_no_input(tmpdir): model = EvalModelTemplate() model.example_input_array = None file_path = os.path.join(tmpdir, "model.onxx") - with pytest.raises(ValueError, match=r'input_sample and example_input_array tensors are both missing'): + with pytest.raises(ValueError, match=r'`input_sample` and `example_input_array` tensors are both missing'): model.to_onnx(file_path) From 5b7c000e4525533e4a4347209c2d0bbadb0767f6 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 5 Aug 2020 17:06:49 +0200 Subject: [PATCH 32/39] max --- pytorch_lightning/trainer/data_loading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 53d624d46c0d9d..c3567624b9821b 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -219,6 +219,7 @@ def reset_train_dataloader(self, model: LightningModule) -> None: if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) elif self.num_training_batches != float('inf'): + self.num_training_batches = min(1.0, self.num_training_batches) self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) elif self.limit_train_batches != 1.0: raise MisconfigurationException( @@ -312,6 +313,7 @@ def _reset_eval_dataloader( if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: num_batches = min(num_batches, 
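                 # (limit_eval_batches == 0.0 also takes this branch,
                 #  and int(0.0) == 0 disables evaluation entirely)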
int(limit_eval_batches)) elif num_batches != float('inf'): + num_batches = min(1.0, num_batches) num_batches = int(num_batches * limit_eval_batches) elif limit_eval_batches != 1.0: raise MisconfigurationException( From 71a773954037d4f596f916ae49569c4133cf1076 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Wed, 5 Aug 2020 22:10:28 +0530 Subject: [PATCH 33/39] check --- docs/source/sequences.rst | 4 ++-- pytorch_lightning/trainer/data_loading.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index bb438154e39091..b9a8f2ee642aad 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -59,10 +59,10 @@ Iterable Datasets Lightning supports using IterableDatasets as well as map-style Datasets. IterableDatasets provide a more natural option when using sequential data. -.. note:: When using an IterableDataset you must set the val_check_interval to 1.0 (the default) or an int +.. note:: When using an IterableDataset you must set the ``val_check_interval`` to 1.0 (the default) or an int (specifying the number of training batches to run before validation) when initializing the Trainer. This is because the IterableDataset does not have a ``__len__`` and Lightning requires this to calculate the validation - interval when ``val_check_interval`` is less than one. Similarly, you can set limit_{mode}_batches to a float or + interval when ``val_check_interval`` is less than one. Similarly, you can set ``limit_{mode}_batches`` to a float or an int. If it is set to 0.0 or 0 it will set ``num_{mode}_batches`` to 0, if it is an int it will set ``num_{mode}_batches`` to ``limit_{mode}_batches``, if it is set to 1.0 it will run for the whole dataset, otherwise it will throw an exception. Here mode can be train/val/test. diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index c3567624b9821b..4eec847580636f 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -219,13 +219,12 @@ def reset_train_dataloader(self, model: LightningModule) -> None: if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) elif self.num_training_batches != float('inf'): - self.num_training_batches = min(1.0, self.num_training_batches) self.num_training_batches = int(self.num_training_batches * self.limit_train_batches) elif self.limit_train_batches != 1.0: raise MisconfigurationException( 'When using an IterableDataset for `limit_train_batches`,' ' `Trainer(limit_train_batches)` must be `0.0`, `1.0` or an int. 
An int k specifies' - ' num_training_batches to use.') + ' `num_training_batches` to use.') # determine when to check validation # if int passed in, val checks that often @@ -305,6 +304,7 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') + self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') @@ -313,7 +313,6 @@ def _reset_eval_dataloader( if isinstance(limit_eval_batches, int) or limit_eval_batches == 0.0: num_batches = min(num_batches, int(limit_eval_batches)) elif num_batches != float('inf'): - num_batches = min(1.0, num_batches) num_batches = int(num_batches * limit_eval_batches) elif limit_eval_batches != 1.0: raise MisconfigurationException( From 3685e3a490196647d18658f67ed7271b9892c9ed Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 01:56:42 +0530 Subject: [PATCH 34/39] check --- pytorch_lightning/trainer/data_loading.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 4eec847580636f..edfe4b72f35b28 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -385,9 +385,6 @@ def request_dataloader(self, dataloader_fx: Callable) -> DataLoader: def determine_data_use_amount(self, overfit_batches: float) -> None: """Use less data for debugging purposes""" if overfit_batches > 0: - if isinstance(overfit_batches, float) and overfit_batches > 1: - raise ValueError('`overfit_batches` when used as a percentage must' - f' be in range 0.0 < x < 1.0 but got {overfit_batches:.3f}.') self.limit_train_batches = overfit_batches self.limit_val_batches = overfit_batches self.limit_test_batches = overfit_batches From a0dfd845b02db9b1e6ebae1cb2e901c983955036 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 02:17:27 +0530 Subject: [PATCH 35/39] check --- pytorch_lightning/trainer/data_loading.py | 19 ------------------- pytorch_lightning/trainer/trainer.py | 8 +++----- tests/trainer/test_dataloaders.py | 8 +++----- 3 files changed, 6 insertions(+), 29 deletions(-) diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index edfe4b72f35b28..38a1118118a403 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -103,21 +103,6 @@ class TrainerDataLoadingMixin(ABC): def is_overridden(self, *args): """Warning: this is just empty shell for code implemented in other class.""" - def _check_batch_limits(self, name: str) -> None: - # TODO: verify it is still needed and deprecate it.. - value = getattr(self, name) - - # ints are fine - if isinstance(value, int): - return - - msg = f'`{name}` must lie in the range [0.0, 1.0], but got {value:.3f}. (or pass in an int)' - if name == 'val_check_interval': - msg += ' If you want to disable validation set `limit_val_batches` to 0.0 instead.' - - if not 0. 
<= value <= 1.: - raise ValueError(msg) - def _worker_check(self, dataloader: DataLoader, name: str) -> None: on_windows = platform.system() == 'Windows' @@ -214,7 +199,6 @@ def reset_train_dataloader(self, model: LightningModule) -> None: self.num_training_batches = len(self.train_dataloader) if _has_len(self.train_dataloader) else float('inf') self._worker_check(self.train_dataloader, 'train dataloader') - self._check_batch_limits('limit_train_batches') if isinstance(self.limit_train_batches, int) or self.limit_train_batches == 0.0: self.num_training_batches = min(self.num_training_batches, int(self.limit_train_batches)) @@ -246,8 +230,6 @@ def reset_train_dataloader(self, model: LightningModule) -> None: ' `Trainer(val_check_interval)` must be `1.0` or an int. An int k specifies' ' checking validation every k training batches.') else: - self._check_batch_limits('val_check_interval') - self.val_check_batch = int(self.num_training_batches * self.val_check_interval) self.val_check_batch = max(1, self.val_check_batch) @@ -304,7 +286,6 @@ def _reset_eval_dataloader( for i, dataloader in enumerate(dataloaders): num_batches = len(dataloader) if _has_len(dataloader) else float('inf') self._worker_check(dataloader, f'{mode} dataloader {i}') - self._check_batch_limits(f'limit_{mode}_batches') # percent or num_steps limit_eval_batches = getattr(self, f'limit_{mode}_batches') diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4b342328df2979..a0e89f586c6059 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -534,7 +534,6 @@ def __init__( # logging self.configure_logger(logger) self.log_save_interval = log_save_interval - self.val_check_interval = val_check_interval self.row_log_interval = row_log_interval # how much of the data to use @@ -547,9 +546,6 @@ def __init__( ) overfit_batches = overfit_pct - # convert floats to ints - self.overfit_batches = _determine_limit_batches(overfit_batches) - # TODO: remove in 0.10.0 if val_percent_check is not None: rank_zero_warn( @@ -580,6 +576,8 @@ def __init__( self.limit_test_batches = _determine_limit_batches(limit_test_batches) self.limit_val_batches = _determine_limit_batches(limit_val_batches) self.limit_train_batches = _determine_limit_batches(limit_train_batches) + self.val_check_interval = _determine_limit_batches(val_check_interval) + self.overfit_batches = _determine_limit_batches(overfit_batches) self.determine_data_use_amount(self.overfit_batches) # AMP init @@ -1437,5 +1435,5 @@ def _determine_limit_batches(batches: Union[int, float]) -> Union[int, float]: return int(batches) else: raise MisconfigurationException( - f'You have passed invalid value {batches}, it has to be in (0, 1) or nature number.' + f'You have passed invalid value {batches}, it has to be in (0, 1) or an int.' 
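+        # in practice the bounds are inclusive: 0 <= batches <= 1 passes
+        # through unchanged, and whole numbers above 1 are cast to int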
) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 1aad5047855a2e..854d5770720a23 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -53,19 +53,15 @@ def test_fit_val_loader_only(tmpdir): @pytest.mark.parametrize("dataloader_options", [ - dict(val_check_interval=1.1), dict(val_check_interval=10000), ]) def test_dataloader_config_errors_runtime(tmpdir, dataloader_options): - model = EvalModelTemplate() - trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, **dataloader_options, ) - with pytest.raises(ValueError): # fit model trainer.fit(model) @@ -78,9 +74,11 @@ def test_dataloader_config_errors_runtime(tmpdir, dataloader_options): dict(limit_val_batches=1.2), dict(limit_test_batches=-0.1), dict(limit_test_batches=1.2), + dict(val_check_interval=1.1), + dict(overfit_batches=1.1), ]) def test_dataloader_config_errors_init(tmpdir, dataloader_options): - with pytest.raises(MisconfigurationException): + with pytest.raises(MisconfigurationException, match='passed invalid value'): Trainer( default_root_dir=tmpdir, max_epochs=1, From aefd79986acbb59ab4b758964b83e0a42249ccd3 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 02:25:57 +0530 Subject: [PATCH 36/39] chlog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0c2e39a428f34..2396dab38539fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) -- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2787](https://github.com/PyTorchLightning/pytorch-lightning/pull/2787)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2840](https://github.com/PyTorchLightning/pytorch-lightning/pull/2840)) ### Changed From ff7353e8a46c8db2dde3c2f5226d79745111a08d Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 02:32:41 +0530 Subject: [PATCH 37/39] tests --- tests/trainer/test_dataloaders.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 854d5770720a23..c4a6ad8d10aed4 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -74,8 +74,10 @@ def test_dataloader_config_errors_runtime(tmpdir, dataloader_options): dict(limit_val_batches=1.2), dict(limit_test_batches=-0.1), dict(limit_test_batches=1.2), - dict(val_check_interval=1.1), - dict(overfit_batches=1.1), + dict(val_check_interval=-0.1), + dict(val_check_interval=1.2), + dict(overfit_batches=-0.1), + dict(overfit_batches=1.2), ]) def test_dataloader_config_errors_init(tmpdir, dataloader_options): with pytest.raises(MisconfigurationException, match='passed invalid value'): From cf3935c3aca1ef2fbf623f0523d50b3374d979cf Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Thu, 6 Aug 2020 22:59:21 +0530 Subject: [PATCH 38/39] update exception message --- pytorch_lightning/trainer/trainer.py | 14 +++++++------- 1 file changed, 7 
insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a0e89f586c6059..ea8d7941d95f60 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -573,11 +573,11 @@ def __init__( ) limit_train_batches = train_percent_check - self.limit_test_batches = _determine_limit_batches(limit_test_batches) - self.limit_val_batches = _determine_limit_batches(limit_val_batches) - self.limit_train_batches = _determine_limit_batches(limit_train_batches) - self.val_check_interval = _determine_limit_batches(val_check_interval) - self.overfit_batches = _determine_limit_batches(overfit_batches) + self.limit_train_batches = _determine_batch_limits(limit_train_batches, 'limit_train_batches') + self.limit_val_batches = _determine_batch_limits(limit_val_batches, 'limit_val_batches') + self.limit_test_batches = _determine_batch_limits(limit_test_batches, 'limit_test_batches') + self.val_check_interval = _determine_batch_limits(val_check_interval, 'val_check_interval') + self.overfit_batches = _determine_batch_limits(overfit_batches, 'overfit_batches') self.determine_data_use_amount(self.overfit_batches) # AMP init @@ -1428,12 +1428,12 @@ def __call__(self) -> Union[List[DataLoader], DataLoader]: return self.dataloader -def _determine_limit_batches(batches: Union[int, float]) -> Union[int, float]: +def _determine_batch_limits(batches: Union[int, float], name: str) -> Union[int, float]: if 0 <= batches <= 1: return batches elif batches > 1 and batches % 1.0 == 0: return int(batches) else: raise MisconfigurationException( - f'You have passed invalid value {batches}, it has to be in (0, 1) or an int.' + f'You have passed invalid value {batches} for {name}, it has to be in [0.0, 1.0] or an int.' ) From cb7d3a736f50df9074b3f21716dbb37c7652f25d Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 6 Aug 2020 21:16:56 +0200 Subject: [PATCH 39/39] Apply suggestions from code review --- tests/trainer/test_dataloaders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index c4a6ad8d10aed4..27dc7ac89100bb 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -281,11 +281,11 @@ def test_inf_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, results = trainer.fit(model) assert results == 1 - assert trainer.num_training_batches == 0 if limit_train_batches == 0.0 else float('inf') - assert trainer.num_val_batches[0] == 0 if limit_val_batches == 0.0 else float('inf') + assert trainer.num_training_batches == (0 if limit_train_batches == 0.0 else float('inf')) + assert trainer.num_val_batches[0] == (0 if limit_val_batches == 0.0 else float('inf')) trainer.test(ckpt_path=None) - assert trainer.num_test_batches[0] == 0 if limit_test_batches == 0.0 else float('inf') + assert trainer.num_test_batches[0] == (0 if limit_test_batches == 0.0 else float('inf')) @pytest.mark.parametrize(