From 4130ec41e8442495473c7e0b6b0e3fa6b10feb7d Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sat, 11 Jul 2020 23:10:47 -0400
Subject: [PATCH 01/25] add tests for single scalar return from training

---
 tests/base/deterministic_model.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py
index c387997da57d7..db9355a043e93 100644
--- a/tests/base/deterministic_model.py
+++ b/tests/base/deterministic_model.py
@@ -15,6 +15,10 @@ def __init__(self, weights=None):
         self.training_step_end_called = False
         self.training_epoch_end_called = False
 
+        self.validation_step_called = False
+        self.validation_step_end_called = False
+        self.validation_epoch_end_called = False
+
         self.l1 = nn.Linear(2, 3, bias=False)
         if weights is None:
             weights = torch.tensor([
@@ -162,6 +166,17 @@ def training_epoch_end_dict(self, outputs):
 
         return {'log': logs, 'progress_bar': pbar}
 
+    def validation_step_no_return(self, batch, batch_idx):
+        acc = self.step(batch, batch_idx)
+
+    def validation_step_scalar_return(self, batch, batch_idx):
+        acc = self.step(batch, batch_idx)
+        return acc
+
+    def validation_step_arbitary_dict_return(self, batch, batch_idx):
+        acc = self.step(batch, batch_idx)
+        return {'some': acc, 'value': 'a'}
+
     def validation_step_dict_return(self, batch, batch_idx):
         acc = self.step(batch, batch_idx)
 

From 4b12043d099afb31a46d0238011f20ea4e6d475d Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sat, 11 Jul 2020 23:10:57 -0400
Subject: [PATCH 02/25] add tests for single scalar return from training

---
 tests/trainer/test_eval_loop_dict_return.py | 166 ++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 tests/trainer/test_eval_loop_dict_return.py

diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
new file mode 100644
index 0000000000000..62244a5b9e464
--- /dev/null
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -0,0 +1,166 @@
+"""
+Tests to ensure that the evaluation loop works with a dict
+"""
+from pytorch_lightning import Trainer
+from tests.base.deterministic_model import DeterministicModel
+
+# train step + val step (no return)
+# train step + val step (scalar return)
+# train loop + val step (arbitrary dict return)
+# train loop + val step (structured return)
+# train loop + val step + val step end
+# train loop + val step + val step end + val epoch end
+# train loop + val step + val epoch end
+
+
+def test_validation_step_dict(tmpdir):
+    """
+    test that the train + val loop can be used
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_dict_return
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        weights_summary=None,
+    )
+    trainer.fit(model)
+
+    # make sure correct steps were called
+    assert model.training_step_called
+    assert not model.training_step_end_called
+    assert not model.training_epoch_end_called
+
+    # make sure training outputs what is expected
+    for batch_idx, batch in enumerate(model.train_dataloader()):
+        break
+
+    out = trainer.run_training_batch(batch, batch_idx)
+    assert out.signal == 0
+    assert out.batch_log_metrics['log_acc1'] == 12.0
+    assert out.batch_log_metrics['log_acc2'] == 7.0
+
+    train_step_out = out.training_step_output_for_epoch_end
+    pbar_metrics = train_step_out['progress_bar']
+    assert 'log' in train_step_out
+    assert 'progress_bar' in train_step_out
+    assert train_step_out['train_step_test'] == 549
+    assert pbar_metrics['pbar_acc1'] == 17.0
+    assert pbar_metrics['pbar_acc2'] == 19.0
+
+    # make sure the optimizer closure returns the correct things
+    opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
+    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
+
+
+def training_step_with_step_end(tmpdir):
+    """
+    Checks train_step + training_step_end
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_for_step_end_dict
+    model.training_step_end = model.training_step_end_dict
+    model.val_dataloader = None
+
+    trainer = Trainer(fast_dev_run=True, weights_summary=None)
+    trainer.fit(model)
+
+    # make sure correct steps were called
+    assert model.training_step_called
+    assert model.training_step_end_called
+    assert not model.training_epoch_end_called
+
+    # make sure training outputs what is expected
+    for batch_idx, batch in enumerate(model.train_dataloader()):
+        break
+
+    out = trainer.run_training_batch(batch, batch_idx)
+    assert out.signal == 0
+    assert out.batch_log_metrics['log_acc1'] == 14.0
+    assert out.batch_log_metrics['log_acc2'] == 9.0
+
+    train_step_end_out = out.training_step_output_for_epoch_end
+    pbar_metrics = train_step_end_out['progress_bar']
+    assert 'train_step_end' in train_step_end_out
+    assert pbar_metrics['pbar_acc1'] == 19.0
+    assert pbar_metrics['pbar_acc2'] == 21.0
+
+
+def test_full_training_loop_dict(tmpdir):
+    """
+    Checks train_step + training_step_end + training_epoch_end
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_for_step_end_dict
+    model.training_step_end = model.training_step_end_dict
+    model.training_epoch_end = model.training_epoch_end_dict
+    model.val_dataloader = None
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        weights_summary=None,
+    )
+    trainer.fit(model)
+
+    # make sure correct steps were called
+    assert model.training_step_called
+    assert model.training_step_end_called
+    assert model.training_epoch_end_called
+
+    # assert epoch end metrics were added
+    assert trainer.callback_metrics['epoch_end_log_1'] == 178
+    assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234
+
+    # make sure training outputs what is expected
+    for batch_idx, batch in enumerate(model.train_dataloader()):
+        break
+
+    out = trainer.run_training_batch(batch, batch_idx)
+    assert out.signal == 0
+    assert out.batch_log_metrics['log_acc1'] == 14.0
+    assert out.batch_log_metrics['log_acc2'] == 9.0
+
+    train_step_end_out = out.training_step_output_for_epoch_end
+    pbar_metrics = train_step_end_out['progress_bar']
+    assert pbar_metrics['pbar_acc1'] == 19.0
+    assert pbar_metrics['pbar_acc2'] == 21.0
+
+
+def test_train_step_epoch_end(tmpdir):
+    """
+    Checks train_step + training_epoch_end (NO training_step_end)
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_dict_return
+    model.training_step_end = None
+    model.training_epoch_end = model.training_epoch_end_dict
+    model.val_dataloader = None
+
+    trainer = Trainer(max_epochs=1, weights_summary=None)
+    trainer.fit(model)
+
+    # make sure correct steps were called
+    assert model.training_step_called
+    assert not model.training_step_end_called
+    assert model.training_epoch_end_called
+
+    # assert epoch end metrics were added
+    assert trainer.callback_metrics['epoch_end_log_1'] == 178
+    assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234
+
+    # make sure training outputs what is expected
+    for batch_idx, batch in enumerate(model.train_dataloader()):
+        break
+
+    out = trainer.run_training_batch(batch, batch_idx)
+    assert out.signal == 0
+    assert out.batch_log_metrics['log_acc1'] == 12.0
+    assert out.batch_log_metrics['log_acc2'] == 7.0
+
+    train_step_end_out = out.training_step_output_for_epoch_end
+    pbar_metrics = train_step_end_out['progress_bar']
+    assert pbar_metrics['pbar_acc1'] == 17.0
+    assert pbar_metrics['pbar_acc2'] == 19.0

From 3466e7e1a5ed8ec78c284a5fad2a9ddd981feab2 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Mon, 13 Jul 2020 11:51:06 -0400
Subject: [PATCH 03/25] add tests for single scalar return from training

---
 pl_examples/domain_templates/imagenet.py     |  2 +-
 pytorch_lightning/trainer/evaluation_loop.py | 12 ++--
 pytorch_lightning/trainer/logging.py         |  7 ++
 pytorch_lightning/trainer/trainer.py         |  2 +-
 tests/base/deterministic_model.py            | 13 ++++
 tests/trainer/test_eval_loop_dict_return.py  | 75 ++++++++++++--------
 6 files changed, 74 insertions(+), 37 deletions(-)

diff --git a/pl_examples/domain_templates/imagenet.py b/pl_examples/domain_templates/imagenet.py
index 19a85b87949df..20fb1cae24732 100644
--- a/pl_examples/domain_templates/imagenet.py
+++ b/pl_examples/domain_templates/imagenet.py
@@ -245,7 +245,7 @@ def main(args: Namespace) -> None:
     )
 
     if args.evaluate:
-        trainer.run_evaluation()
+        trainer.test()
     else:
         trainer.fit(model)
 
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 16f68f1e13502..2bfb54c5d9372 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -315,7 +315,7 @@ def _evaluate(
 
         # with a single dataloader don't pass an array
         if len(dataloaders) == 1:
-            outputs = outputs[0]
+            eval_results = outputs[0]
 
         # give model a chance to do something with the outputs (and method defined)
         if isinstance(model, (LightningDistributedDataParallel, LightningDataParallel)):
@@ -324,22 +324,22 @@ def _evaluate(
         if test_mode:
             if self.is_overridden('test_end', model=model):
                 # TODO: remove in v1.0.0
-                eval_results = model.test_end(outputs)
+                eval_results = model.test_end(eval_results)
                 rank_zero_warn('Method `test_end` was deprecated in v0.7 and will be removed in v1.0.'
                                ' Use `test_epoch_end` instead.', DeprecationWarning)
 
             elif self.is_overridden('test_epoch_end', model=model):
-                eval_results = model.test_epoch_end(outputs)
+                eval_results = model.test_epoch_end(eval_results)
 
         else:
             if self.is_overridden('validation_end', model=model):
                 # TODO: remove in v1.0.0
-                eval_results = model.validation_end(outputs)
+                eval_results = model.validation_end(eval_results)
                 rank_zero_warn('Method `validation_end` was deprecated in v0.7 and will be removed in v1.0.'
                                ' Use `validation_epoch_end` instead.', DeprecationWarning)
 
             elif self.is_overridden('validation_epoch_end', model=model):
-                eval_results = model.validation_epoch_end(outputs)
+                eval_results = model.validation_epoch_end(eval_results)
 
         # enable train mode again
         model.train()
@@ -429,7 +429,7 @@ def run_evaluation(self, test_mode: bool = False):
         else:
             self.on_validation_end()
 
-        return callback_metrics
+        return callback_metrics, eval_results
 
     def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: bool = False):
         # make dataloader_idx arg in validation_step optional
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index 35f5d5d35b9ca..e5911a87923e0 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -109,6 +109,13 @@ def process_output(self, output, train=False):
             hiddens = None
             return output, progress_bar_metrics, log_metrics, callback_metrics, hiddens
 
+        # --------------------------
+        # handle lists
+        # --------------------------
+        if isinstance(output, list):
+            # TODO: what to do when given a list?
+            output = {}
+
         # ---------------
         # EXTRACT CALLBACK KEYS
         # ---------------
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 770dc4b314688..762d7eca2b090 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1163,7 +1163,7 @@ def run_pretrain_routine(self, model: LightningModule):
         if self.testing:
             # only load test dataloader for testing
             # self.reset_test_dataloader(ref_model)
-            results = self.run_evaluation(test_mode=True)
+            results, _ = self.run_evaluation(test_mode=True)
 
             # remove all cuda tensors
             if results is not None and isinstance(results, dict) and len(results) > 0:
diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py
index db9355a043e93..52aed0e6acdd8 100644
--- a/tests/base/deterministic_model.py
+++ b/tests/base/deterministic_model.py
@@ -167,23 +167,36 @@ def training_epoch_end_dict(self, outputs):
         return {'log': logs, 'progress_bar': pbar}
 
     def validation_step_no_return(self, batch, batch_idx):
+        self.validation_step_called = True
         acc = self.step(batch, batch_idx)
 
     def validation_step_scalar_return(self, batch, batch_idx):
+        self.validation_step_called = True
         acc = self.step(batch, batch_idx)
         return acc
 
     def validation_step_arbitary_dict_return(self, batch, batch_idx):
+        self.validation_step_called = True
         acc = self.step(batch, batch_idx)
         return {'some': acc, 'value': 'a'}
 
     def validation_step_dict_return(self, batch, batch_idx):
+        self.validation_step_called = True
         acc = self.step(batch, batch_idx)
 
         logs = {'log_acc1': torch.tensor(12).type_as(acc), 'log_acc2': torch.tensor(7).type_as(acc)}
         pbar = {'pbar_acc1': torch.tensor(17).type_as(acc), 'pbar_acc2': torch.tensor(19).type_as(acc)}
         return {'val_loss': acc, 'log': logs, 'progress_bar': pbar}
 
+    def validation_step_end(self, outputs):
+        self.validation_step_end_called = True
+
+    def validation_epoch_end(self, outputs):
+        self.validation_epoch_end_called = True
+
+    # -----------------------------
+    # DATA
+    # -----------------------------
     def train_dataloader(self):
         return DataLoader(DummyDataset(), batch_size=3, shuffle=False)
 
diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 62244a5b9e464..2d99221d8735a 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -13,13 +13,15 @@
 # train loop + val step + val epoch end
 
 
-def test_validation_step_dict(tmpdir):
+def test_validation_step_no_return(tmpdir):
     """
-    test that the train + val loop can be used
+    Test that val step can return nothing
     """
     model = DeterministicModel()
     model.training_step = model.training_step_dict_return
-    model.validation_step = model.validation_step_dict_return
+    model.validation_step = model.validation_step_no_return
+    model.validation_step_end = None
+    model.validation_epoch_end = None
 
     trainer = Trainer(
         default_root_dir=tmpdir,
@@ -28,32 +30,50 @@ def test_validation_step_dict(tmpdir):
     )
     trainer.fit(model)
 
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    out, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(out) == 0
+    assert len(eval_results) == 0
+
     # make sure correct steps were called
-    assert model.training_step_called
-    assert not model.training_step_end_called
-    assert not model.training_epoch_end_called
+    assert model.validation_step_called
+    assert not model.validation_step_end_called
+    assert not model.validation_epoch_end_called
 
-    # make sure training outputs what is expected
-    for batch_idx, batch in enumerate(model.train_dataloader()):
-        break
 
-    out = trainer.run_training_batch(batch, batch_idx)
-    assert out.signal == 0
-    assert out.batch_log_metrics['log_acc1'] == 12.0
-    assert out.batch_log_metrics['log_acc2'] == 7.0
+def test_validation_step_scalar_return(tmpdir):
+    """
+    Test that val step can return a scalar
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_scalar_return
+    model.validation_step_end = None
+    model.validation_epoch_end = None
 
-    train_step_out = out.training_step_output_for_epoch_end
-    pbar_metrics = train_step_out['progress_bar']
-    assert 'log' in train_step_out
-    assert 'progress_bar' in train_step_out
-    assert train_step_out['train_step_test'] == 549
-    assert pbar_metrics['pbar_acc1'] == 17.0
-    assert pbar_metrics['pbar_acc2'] == 19.0
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        weights_summary=None,
+        limit_train_batches=2,
+        limit_val_batches=2
+    )
+    trainer.fit(model)
+
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    out, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(out) == 0
+    assert len(eval_results) == 2
+    assert eval_results[0] == 171 and eval_results[1] == 171
+
+    # make sure correct steps were called
+    assert model.validation_step_called
+    assert not model.validation_step_end_called
+    assert not model.validation_epoch_end_called
 
-    # make sure the optimizer closure returns the correct things
-    opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
-    assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3)
 
+test_validation_step_scalar_return('')
 
 def training_step_with_step_end(tmpdir):
     """
@@ -73,8 +93,7 @@ def training_step_with_step_end(tmpdir):
     assert not model.training_epoch_end_called
 
     # make sure training outputs what is expected
-    for batch_idx, batch in enumerate(model.train_dataloader()):
-        break
+    batch_idx, batch = 0, next(iter(model.train_dataloader()))
 
     out = trainer.run_training_batch(batch, batch_idx)
     assert out.signal == 0
@@ -115,8 +134,7 @@ def test_full_training_loop_dict(tmpdir):
     assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234
 
     # make sure training outputs what is expected
-    for batch_idx, batch in enumerate(model.train_dataloader()):
-        break
+    batch_idx, batch = 0, next(iter(model.train_dataloader()))
 
     out = trainer.run_training_batch(batch, batch_idx)
     assert out.signal == 0
@@ -152,8 +170,7 @@ def test_train_step_epoch_end(tmpdir):
     assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234
 
     # make sure training outputs what is expected
-    for batch_idx, batch in enumerate(model.train_dataloader()):
-        break
+    batch_idx, batch = 0, next(iter(model.train_dataloader()))
 
     out = trainer.run_training_batch(batch, batch_idx)
     assert out.signal == 0

From 2d1b213e1601288c1dd227d976eda3b1bb41b8e1 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 06:27:42 -0400
Subject: [PATCH 04/25] fixing val step only

---
 pytorch_lightning/trainer/evaluation_loop.py | 32 ++++++++++++--------
 pytorch_lightning/trainer/logging.py         |  7 -----
 pytorch_lightning/trainer/trainer.py         | 27 +++++++++++------
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 2bfb54c5d9372..c8da00ae22542 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -393,23 +393,29 @@ def run_evaluation(self, test_mode: bool = False):
         # enable no returns
         callback_metrics = {}
         if eval_results is not None and len(eval_results) > 0:
-            _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(eval_results)
 
-            # add metrics to prog bar
-            self.add_progress_bar_metrics(prog_bar_metrics)
+            # in eval, the user may return something at every validation step without final reduction
+            if not isinstance(eval_results, list):
+                eval_results = [eval_results]
 
-            # log results of test
-            if test_mode and self.is_global_zero:
-                print('-' * 80)
-                print('TEST RESULTS')
-                pprint(callback_metrics)
-                print('-' * 80)
+            for result in eval_results:
+                _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.process_output(result)
 
-            # log metrics
-            self.log_metrics(log_metrics, {})
+                # add metrics to prog bar
+                self.add_progress_bar_metrics(prog_bar_metrics)
 
-            # track metrics for callbacks
-            self.callback_metrics.update(callback_metrics)
+                # log results of test
+                if test_mode and self.is_global_zero:
+                    print('-' * 80)
+                    print('TEST RESULTS')
+                    pprint(callback_metrics)
+                    print('-' * 80)
+
+                # log metrics
+                self.log_metrics(log_metrics, {})
+
+                # track metrics for callbacks
+                self.callback_metrics.update(callback_metrics)
 
         # hook
         model.on_post_performance_check()
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index e5911a87923e0..35f5d5d35b9ca 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -109,13 +109,6 @@ def process_output(self, output, train=False):
             hiddens = None
             return output, progress_bar_metrics, log_metrics, callback_metrics, hiddens
 
-        # --------------------------
-        # handle lists
-        # --------------------------
-        if isinstance(output, list):
-            # TODO: what to do when given a list?
-            output = {}
-
         # ---------------
         # EXTRACT CALLBACK KEYS
         # ---------------
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 762d7eca2b090..917b73158c464 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1179,6 +1179,20 @@ def run_pretrain_routine(self, model: LightningModule):
         self.disable_validation = not (self.is_overridden('validation_step') and self.limit_val_batches > 0) \
             and not self.fast_dev_run
 
+        # run a few val batches before training starts
+        self._run_sanity_check(ref_model, model)
+
+        # clear cache before training
+        if self.on_gpu and self.root_gpu is not None:
+            # use context because of:
+            # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898
+            with torch.cuda.device(f'cuda:{self.root_gpu}'):
+                torch.cuda.empty_cache()
+
+        # CORE TRAINING LOOP
+        self.train()
+
+    def _run_sanity_check(self, ref_model, model):
         # run tiny validation (if validation defined)
         # to make sure program won't crash during val
         if not self.disable_validation and self.num_sanity_val_steps > 0:
@@ -1197,21 +1211,14 @@ def run_pretrain_routine(self, model: LightningModule):
 
             # allow no returns from eval
             if eval_results is not None and len(eval_results) > 0:
+                # when we get a list back, use only the last item
+                if isinstance(eval_results, list):
+                    eval_results = eval_results[-1]
                 _, _, _, callback_metrics, _ = self.process_output(eval_results)
                 self.callback_metrics = callback_metrics
 
             self.on_sanity_check_end()
 
-        # clear cache before training
-        if self.on_gpu and self.root_gpu is not None:
-            # use context because of:
-            # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898
-            with torch.cuda.device(f'cuda:{self.root_gpu}'):
-                torch.cuda.empty_cache()
-
-        # CORE TRAINING LOOP
-        self.train()
-
     def test(
             self,
             model: Optional[LightningModule] = None,

From 07115f43805be5fe48705b743590bfcba058907e Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 06:30:09 -0400
Subject: [PATCH 05/25] fixing val step only

---
 pytorch_lightning/trainer/trainer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 917b73158c464..85c2dc1f7c0dd 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1125,7 +1125,6 @@ def run_pretrain_routine(self, model: LightningModule):
         if self.logger is not None:
             # save exp to get started
             self.logger.log_hyperparams(ref_model.hparams)
-
             self.logger.save()
 
         if self.use_ddp or self.use_ddp2:

From 034860cd6691ed7b147443150a0f7af3a38c67ef Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 06:33:42 -0400
Subject: [PATCH 06/25] fixing val step only

---
 tests/trainer/test_eval_loop_dict_return.py | 37 ++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 2d99221d8735a..347e29352d4d1 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -73,7 +73,42 @@ def test_validation_step_scalar_return(tmpdir):
     assert not model.validation_epoch_end_called
 
 
-test_validation_step_scalar_return('')
+def test_validation_step_arbitrary_dict_return(tmpdir):
+    """
+    Test that val step can return a scalar
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_arbitary_dict_return
+    model.validation_step_end = None
+    model.validation_epoch_end = None
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        weights_summary=None,
+        limit_train_batches=2,
+        limit_val_batches=2
+    )
+    trainer.fit(model)
+
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(callback_metrics) == 2
+    assert len(eval_results) == 2
+    assert eval_results[0]['some'] == 171
+    assert eval_results[1]['some'] == 171
+
+    assert eval_results[0]['value'] == 'a'
+    assert eval_results[1]['value'] == 'a'
+
+    # make sure correct steps were called
+    assert model.validation_step_called
+    assert not model.validation_step_end_called
+    assert not model.validation_epoch_end_called
+
+test_validation_step_arbitrary_dict_return('')
+
 
 def training_step_with_step_end(tmpdir):
     """

From 2023f4dcf471ea1ada63fb215d64e0d2cc5d4729 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 06:40:47 -0400
Subject: [PATCH 07/25] fixing val step only

---
 tests/base/deterministic_model.py           |  2 +-
 tests/trainer/test_eval_loop_dict_return.py | 40 ++++++++++++++++++---
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py
index 52aed0e6acdd8..7afd9d6905517 100644
--- a/tests/base/deterministic_model.py
+++ b/tests/base/deterministic_model.py
@@ -184,7 +184,7 @@ def validation_step_dict_return(self, batch, batch_idx):
         self.validation_step_called = True
         acc = self.step(batch, batch_idx)
 
-        logs = {'log_acc1': torch.tensor(12).type_as(acc), 'log_acc2': torch.tensor(7).type_as(acc)}
+        logs = {'log_acc1': torch.tensor(12 + batch_idx).type_as(acc), 'log_acc2': torch.tensor(7).type_as(acc)}
         pbar = {'pbar_acc1': torch.tensor(17).type_as(acc), 'pbar_acc2': torch.tensor(19).type_as(acc)}
         return {'val_loss': acc, 'log': logs, 'progress_bar': pbar}
 
diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 347e29352d4d1..2ec5231fcb710 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -4,9 +4,6 @@
 from pytorch_lightning import Trainer
 from tests.base.deterministic_model import DeterministicModel
 
-# train step + val step (no return)
-# train step + val step (scalar return)
-# train loop + val step (arbitrary dict return)
 # train loop + val step (structured return)
 # train loop + val step + val step end
 # train loop + val step + val step end + val epoch end
@@ -107,7 +104,42 @@ def test_validation_step_arbitrary_dict_return(tmpdir):
     assert not model.validation_step_end_called
     assert not model.validation_epoch_end_called
 
-test_validation_step_arbitrary_dict_return('')
+
+def test_validation_step_dict_return(tmpdir):
+    """
+    Test that val step can return a scalar
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_dict_return
+    model.validation_step_end = None
+    model.validation_epoch_end = None
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        weights_summary=None,
+        limit_train_batches=2,
+        limit_val_batches=2
+    )
+    trainer.fit(model)
+
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(callback_metrics) == 5
+    assert len(eval_results) == 2
+    assert eval_results[0]['log']['log_acc1'] == 12
+    assert eval_results[1]['log']['log_acc1'] == 13
+
+    for k in ['val_loss', 'log', 'progress_bar']:
+        assert k in eval_results[0]
+        assert k in eval_results[1]
+
+    # ensure all the keys ended up as candidates for callbacks
+    assert len(trainer.callback_metrics) == 7
+
+
+test_validation_step_dict_return('')
 
 
 def training_step_with_step_end(tmpdir):

From c56acea431bcdcd861f0605ba2eb1ee662e0817a Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 06:47:15 -0400
Subject: [PATCH 08/25] fixing val step only

---
 tests/trainer/test_eval_loop_dict_return.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 2ec5231fcb710..8fae348f2befc 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -72,7 +72,7 @@ def test_validation_step_scalar_return(tmpdir):
 
 def test_validation_step_arbitrary_dict_return(tmpdir):
     """
-    Test that val step can return a scalar
+    Test that val step can return an arbitrary dict
     """
     model = DeterministicModel()
     model.training_step = model.training_step_dict_return
@@ -107,7 +107,8 @@ def test_validation_step_arbitrary_dict_return(tmpdir):
 
 def test_validation_step_dict_return(tmpdir):
     """
-    Test that val step can return a scalar
+    Test that val step can return a dict with all the expected keys and they end up
+    in the correct place
     """
     model = DeterministicModel()
     model.training_step = model.training_step_dict_return

From a5098ea8d82af213eaeeaf6313d27ac8a35593e0 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 06:47:21 -0400
Subject: [PATCH 09/25] fixing val step only

---
 tests/trainer/test_eval_loop_dict_return.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 8fae348f2befc..18c51768376b8 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -4,7 +4,6 @@
 from pytorch_lightning import Trainer
 from tests.base.deterministic_model import DeterministicModel
 
-# train loop + val step (structured return)
 # train loop + val step + val step end
 # train loop + val step + val step end + val epoch end
 # train loop + val step + val epoch end

From b4d8b364b810daf01c8d59097b68d4f73bb15bed Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 06:58:38 -0400
Subject: [PATCH 10/25] fixing val step only

---
 pytorch_lightning/trainer/evaluation_loop.py |   3 +-
 tests/base/deterministic_model.py            |  15 ++-
 tests/trainer/test_eval_loop_dict_return.py  | 133 +++++++------------
 3 files changed, 64 insertions(+), 87 deletions(-)

diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index c8da00ae22542..2ec67caa9bd15 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -307,7 +307,8 @@ def _evaluate(
                     self.on_validation_batch_end()
 
                 # track outputs for collation
-                dl_outputs.append(output)
+                if output is not None:
+                    dl_outputs.append(output)
 
             outputs.append(dl_outputs)
 
diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py
index 7afd9d6905517..31e005c92bc3a 100644
--- a/tests/base/deterministic_model.py
+++ b/tests/base/deterministic_model.py
@@ -188,9 +188,22 @@ def validation_step_dict_return(self, batch, batch_idx):
         pbar = {'pbar_acc1': torch.tensor(17).type_as(acc), 'pbar_acc2': torch.tensor(19).type_as(acc)}
         return {'val_loss': acc, 'log': logs, 'progress_bar': pbar}
 
-    def validation_step_end(self, outputs):
+    def validation_step_end_no_return(self, val_step_output):
+        assert len(val_step_output) == 3
+        assert val_step_output['val_loss'] == 171
+        assert val_step_output['log']['log_acc1'] >= 12
+        assert val_step_output['progress_bar']['pbar_acc1'] == 17
         self.validation_step_end_called = True
 
+    def validation_step_end(self, val_step_output):
+        assert len(val_step_output) == 3
+        assert val_step_output['val_loss'] == 171
+        assert val_step_output['log']['log_acc1'] >= 12
+        assert val_step_output['progress_bar']['pbar_acc1'] == 17
+        self.validation_step_end_called = True
+
+        return val_step_output
+
     def validation_epoch_end(self, outputs):
         self.validation_epoch_end_called = True
 
diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 18c51768376b8..655c4b1399d2a 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -138,113 +138,76 @@ def test_validation_step_dict_return(tmpdir):
     # ensure all the keys ended up as candidates for callbacks
     assert len(trainer.callback_metrics) == 7
 
-
-test_validation_step_dict_return('')
+    # make sure correct steps were called
+    assert model.validation_step_called
+    assert not model.validation_step_end_called
+    assert not model.validation_epoch_end_called
 
 
-def training_step_with_step_end(tmpdir):
+def test_val_step_step_end_no_return(tmpdir):
     """
-    Checks train_step + training_step_end
+    Test that val step + val step end work
     """
     model = DeterministicModel()
-    model.training_step = model.training_step_for_step_end_dict
-    model.training_step_end = model.training_step_end_dict
-    model.val_dataloader = None
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_dict_return
+    model.validation_step_end = model.validation_step_end_no_return
+    model.validation_epoch_end = None
 
-    trainer = Trainer(fast_dev_run=True, weights_summary=None)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        weights_summary=None,
+        limit_train_batches=2,
+        limit_val_batches=2
+    )
     trainer.fit(model)
 
-    # make sure correct steps were called
-    assert model.training_step_called
-    assert model.training_step_end_called
-    assert not model.training_epoch_end_called
-
-    # make sure training outputs what is expected
-    batch_idx, batch = 0, next(iter(model.train_dataloader()))
-
-    out = trainer.run_training_batch(batch, batch_idx)
-    assert out.signal == 0
-    assert out.batch_log_metrics['log_acc1'] == 14.0
-    assert out.batch_log_metrics['log_acc2'] == 9.0
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(callback_metrics) == 0
+    assert len(eval_results) == 0
 
-    train_step_end_out = out.training_step_output_for_epoch_end
-    pbar_metrics = train_step_end_out['progress_bar']
-    assert 'train_step_end' in train_step_end_out
-    assert pbar_metrics['pbar_acc1'] == 19.0
-    assert pbar_metrics['pbar_acc2'] == 21.0
+    # make sure correct steps were called
+    assert model.validation_step_called
+    assert model.validation_step_end_called
+    assert not model.validation_epoch_end_called
 
 
-def test_full_training_loop_dict(tmpdir):
+def test_val_step_step_end(tmpdir):
     """
-    Checks train_step + training_step_end + training_epoch_end
+    Test that val step + val step end work
     """
     model = DeterministicModel()
-    model.training_step = model.training_step_for_step_end_dict
-    model.training_step_end = model.training_step_end_dict
-    model.training_epoch_end = model.training_epoch_end_dict
-    model.val_dataloader = None
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_dict_return
+    model.validation_step_end = model.validation_step_end
+    model.validation_epoch_end = None
 
     trainer = Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1,
         weights_summary=None,
+        limit_train_batches=2,
+        limit_val_batches=2
     )
     trainer.fit(model)
 
-    # make sure correct steps were called
-    assert model.training_step_called
-    assert model.training_step_end_called
-    assert model.training_epoch_end_called
-
-    # assert epoch end metrics were added
-    assert trainer.callback_metrics['epoch_end_log_1'] == 178
-    assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234
-
-    # make sure training outputs what is expected
-    batch_idx, batch = 0, next(iter(model.train_dataloader()))
-
-    out = trainer.run_training_batch(batch, batch_idx)
-    assert out.signal == 0
-    assert out.batch_log_metrics['log_acc1'] == 14.0
-    assert out.batch_log_metrics['log_acc2'] == 9.0
-
-    train_step_end_out = out.training_step_output_for_epoch_end
-    pbar_metrics = train_step_end_out['progress_bar']
-    assert pbar_metrics['pbar_acc1'] == 19.0
-    assert pbar_metrics['pbar_acc2'] == 21.0
-
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(callback_metrics) == 5
+    assert len(eval_results) == 2
+    assert eval_results[0]['log']['log_acc1'] == 12
+    assert eval_results[1]['log']['log_acc1'] == 13
 
-def test_train_step_epoch_end(tmpdir):
-    """
-    Checks train_step + training_epoch_end (NO training_step_end)
-    """
-    model = DeterministicModel()
-    model.training_step = model.training_step_dict_return
-    model.training_step_end = None
-    model.training_epoch_end = model.training_epoch_end_dict
-    model.val_dataloader = None
+    for k in ['val_loss', 'log', 'progress_bar']:
+        assert k in eval_results[0]
+        assert k in eval_results[1]
 
-    trainer = Trainer(max_epochs=1, weights_summary=None)
-    trainer.fit(model)
+    # ensure all the keys ended up as candidates for callbacks
+    assert len(trainer.callback_metrics) == 8
 
     # make sure correct steps were called
-    assert model.training_step_called
-    assert not model.training_step_end_called
-    assert model.training_epoch_end_called
-
-    # assert epoch end metrics were added
-    assert trainer.callback_metrics['epoch_end_log_1'] == 178
-    assert trainer.progress_bar_metrics['epoch_end_pbar_1'] == 234
-
-    # make sure training outputs what is expected
-    batch_idx, batch = 0, next(iter(model.train_dataloader()))
-
-    out = trainer.run_training_batch(batch, batch_idx)
-    assert out.signal == 0
-    assert out.batch_log_metrics['log_acc1'] == 12.0
-    assert out.batch_log_metrics['log_acc2'] == 7.0
-
-    train_step_end_out = out.training_step_output_for_epoch_end
-    pbar_metrics = train_step_end_out['progress_bar']
-    assert pbar_metrics['pbar_acc1'] == 17.0
-    assert pbar_metrics['pbar_acc2'] == 19.0
+    assert model.validation_step_called
+    assert model.validation_step_end_called
+    assert not model.validation_epoch_end_called

From 265eb2df8d7b2e4c18425f2ba712fd1a9333c65b Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 07:23:10 -0400
Subject: [PATCH 11/25] fixing val step only

---
 tests/base/deterministic_model.py           | 11 +++
 tests/trainer/test_eval_loop_dict_return.py | 87 ++++++++++++++++++++-
 2 files changed, 96 insertions(+), 2 deletions(-)

diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py
index 31e005c92bc3a..a4988673c60a4 100644
--- a/tests/base/deterministic_model.py
+++ b/tests/base/deterministic_model.py
@@ -202,11 +202,22 @@ def validation_step_end(self, val_step_output):
         assert val_step_output['progress_bar']['pbar_acc1'] == 17
         self.validation_step_end_called = True
 
+        val_step_output['val_step_end'] = torch.tensor(1802)
+
         return val_step_output
 
     def validation_epoch_end(self, outputs):
+        assert len(outputs) == self.trainer.num_val_batches[0]
+
+        for i, out in enumerate(outputs):
+            assert out['log']['log_acc1'] >= 12 + i
+
         self.validation_epoch_end_called = True
 
+        result = outputs[-1]
+        result['val_epoch_end'] = torch.tensor(1233)
+        return result
+
     # -----------------------------
     # DATA
     # -----------------------------
diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 655c4b1399d2a..381bf0a7c0a07 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -195,7 +195,8 @@ def test_val_step_step_end(tmpdir):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(callback_metrics) == 5
+    assert len(callback_metrics) == 6
+    assert callback_metrics['val_step_end'] == 1802
     assert len(eval_results) == 2
     assert eval_results[0]['log']['log_acc1'] == 12
     assert eval_results[1]['log']['log_acc1'] == 13
@@ -205,9 +206,91 @@ def test_val_step_step_end(tmpdir):
         assert k in eval_results[1]
 
     # ensure all the keys ended up as candidates for callbacks
-    assert len(trainer.callback_metrics) == 8
+    assert len(trainer.callback_metrics) == 9
 
     # make sure correct steps were called
     assert model.validation_step_called
     assert model.validation_step_end_called
     assert not model.validation_epoch_end_called
+
+
+def test_no_val_step_end(tmpdir):
+    """
+    Test that val step + val epoch end work (without val step end)
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_dict_return
+    model.validation_step_end = None
+    model.validation_epoch_end = model.validation_epoch_end
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        weights_summary=None,
+        limit_train_batches=2,
+        limit_val_batches=3,
+        num_sanity_val_steps=0
+    )
+    trainer.fit(model)
+
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(callback_metrics) == 6
+    assert len(eval_results) == 1
+
+    eval_results = eval_results[0]
+    assert 'val_step_end' not in eval_results
+    assert eval_results['val_epoch_end'] == 1233
+
+    for k in ['val_loss', 'log', 'progress_bar']:
+        assert k in eval_results
+
+    # ensure all the keys ended up as candidates for callbacks
+    assert len(trainer.callback_metrics) == 9
+
+    # make sure correct steps were called
+    assert model.validation_step_called
+    assert not model.validation_step_end_called
+    assert model.validation_epoch_end_called
+
+
+def test_full_val_loop(tmpdir):
+    """
+    Test that val step + val step + val epoch end
+    """
+    model = DeterministicModel()
+    model.training_step = model.training_step_dict_return
+    model.validation_step = model.validation_step_dict_return
+    model.validation_step_end = model.validation_step_end
+    model.validation_epoch_end = model.validation_epoch_end
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        weights_summary=None,
+        limit_train_batches=2,
+        limit_val_batches=3,
+        num_sanity_val_steps=0
+    )
+    trainer.fit(model)
+
+    # out are the results of the full loop
+    # eval_results are output of _evaluate
+    callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
+    assert len(callback_metrics) == 7
+    assert len(eval_results) == 1
+
+    eval_results = eval_results[0]
+    assert eval_results['val_step_end'] == 1802
+    assert eval_results['val_epoch_end'] == 1233
+
+    for k in ['val_loss', 'log', 'progress_bar']:
+        assert k in eval_results
+
+    # ensure all the keys ended up as candidates for callbacks
+    assert len(trainer.callback_metrics) == 10
+
+    # make sure correct steps were called
+    assert model.validation_step_called
+    assert model.validation_step_end_called
+    assert model.validation_epoch_end_called

From 9366372e45a3bd552b2fbb6d3d1e5686d0a86709 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 07:23:28 -0400
Subject: [PATCH 12/25] fixing val step only

---
 tests/trainer/test_eval_loop_dict_return.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 381bf0a7c0a07..1007dcf24003d 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -4,10 +4,6 @@
 from pytorch_lightning import Trainer
 from tests.base.deterministic_model import DeterministicModel
 
-# train loop + val step + val step end
-# train loop + val step + val step end + val epoch end
-# train loop + val step + val epoch end
-
 
 def test_validation_step_no_return(tmpdir):
     """

From 7d38dceeb12caa799fff96959f0b3e4d3ed3369f Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 07:25:54 -0400
Subject: [PATCH 13/25] fixing val step only

---
 tests/trainer/test_eval_loop_dict_return.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 1007dcf24003d..9bb301c56734e 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -142,7 +142,7 @@ def test_validation_step_dict_return(tmpdir):
 
 def test_val_step_step_end_no_return(tmpdir):
     """
-    Test that val step + val step end work
+    Test that val step + val step end work (with no return in val step end)
     """
     model = DeterministicModel()
     model.training_step = model.training_step_dict_return
@@ -253,7 +253,7 @@ def test_no_val_step_end(tmpdir):
 
 def test_full_val_loop(tmpdir):
     """
-    Test that val step + val step + val epoch end
+    Test that val step + val step end + val epoch end
     """
     model = DeterministicModel()
     model.training_step = model.training_step_dict_return

From 5e7f61be79566da3a77b5c1a5cf61bff8a6e5569 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 10:27:54 -0400
Subject: [PATCH 14/25] fixing val step only

---
 pytorch_lightning/trainer/evaluation_loop.py |  7 +++++--
 pytorch_lightning/trainer/trainer.py         | 21 ++++++++++----------
 tests/trainer/test_eval_loop_dict_return.py  | 21 +++++++++++++-------
 3 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 2ec67caa9bd15..c21dc20fe50f6 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -392,7 +392,7 @@ def run_evaluation(self, test_mode: bool = False):
         eval_results = self._evaluate(self.model, dataloaders, max_batches, test_mode)
 
         # enable no returns
-        callback_metrics = {}
+        eval_loop_results = []
         if eval_results is not None and len(eval_results) > 0:
 
             # in eval, the user may return something at every validation step without final reduction
@@ -418,6 +418,9 @@ def run_evaluation(self, test_mode: bool = False):
                 # track metrics for callbacks
                 self.callback_metrics.update(callback_metrics)
 
+                if len(callback_metrics) > 0:
+                    eval_loop_results.append(callback_metrics)
+
         # hook
         model.on_post_performance_check()
 
@@ -436,7 +439,7 @@ def run_evaluation(self, test_mode: bool = False):
         else:
             self.on_validation_end()
 
-        return callback_metrics, eval_results
+        return eval_loop_results, eval_results
 
     def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: bool = False):
         # make dataloader_idx arg in validation_step optional
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 85c2dc1f7c0dd..84ccdf136b248 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -128,7 +128,8 @@ class Trainer(
         >>> trainer = Trainer(max_epochs=1, progress_bar_refresh_rate=0)
         >>> trainer.fit(model, train_loader)
         1
-        >>> trainer.test(model, train_loader)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+        >>> test_outputs = trainer.test(model, train_loader)
+        >>> len(test_outputs)# doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
         1
     """
     DEPRECATED_IN_0_9 = ('use_amp', 'show_progress_bar', 'training_tqdm_dict', 'num_tpu_cores')
@@ -1162,18 +1163,18 @@ def run_pretrain_routine(self, model: LightningModule):
         if self.testing:
             # only load test dataloader for testing
             # self.reset_test_dataloader(ref_model)
-            results, _ = self.run_evaluation(test_mode=True)
+            eval_loop_results, _ = self.run_evaluation(test_mode=True)
 
-            # remove all cuda tensors
-            if results is not None and isinstance(results, dict) and len(results) > 0:
-                for k, v in results.items():
-                    if isinstance(v, torch.Tensor):
-                        results[k] = v.cpu().item()
-
-                return results
-            else:
+            if len(eval_loop_results) == 0:
                 return 1
 
+            # remove the tensors from the eval results
+            for i, result in eval_loop_results:
+                if isinstance(result, dict):
+                    for k, v in result.items():
+                        if isinstance(v, torch.Tensor):
+                            result[k] = v.cpu().item()
+
         # check if we should run validation during training
         self.disable_validation = not (self.is_overridden('validation_step') and self.limit_val_batches > 0) \
             and not self.fast_dev_run
diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index 9bb301c56734e..ef3a18fa1d979 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -48,7 +48,8 @@ def test_validation_step_scalar_return(tmpdir):
         default_root_dir=tmpdir,
         weights_summary=None,
         limit_train_batches=2,
-        limit_val_batches=2
+        limit_val_batches=2,
+        max_epochs=2
     )
     trainer.fit(model)
 
@@ -79,7 +80,8 @@ def test_validation_step_arbitrary_dict_return(tmpdir):
         default_root_dir=tmpdir,
         weights_summary=None,
         limit_train_batches=2,
-        limit_val_batches=2
+        limit_val_batches=2,
+        max_epochs=2
     )
     trainer.fit(model)
 
@@ -115,7 +117,8 @@ def test_validation_step_dict_return(tmpdir):
         default_root_dir=tmpdir,
         weights_summary=None,
         limit_train_batches=2,
-        limit_val_batches=2
+        limit_val_batches=2,
+        max_epochs=2
     )
     trainer.fit(model)
 
@@ -154,7 +157,8 @@ def test_val_step_step_end_no_return(tmpdir):
         default_root_dir=tmpdir,
         weights_summary=None,
         limit_train_batches=2,
-        limit_val_batches=2
+        limit_val_batches=2,
+        max_epochs=2
     )
     trainer.fit(model)
 
@@ -184,7 +188,8 @@ def test_val_step_step_end(tmpdir):
         default_root_dir=tmpdir,
         weights_summary=None,
         limit_train_batches=2,
-        limit_val_batches=2
+        limit_val_batches=2,
+        max_epochs=2
     )
     trainer.fit(model)
 
@@ -225,7 +230,8 @@ def test_no_val_step_end(tmpdir):
         weights_summary=None,
         limit_train_batches=2,
         limit_val_batches=3,
-        num_sanity_val_steps=0
+        num_sanity_val_steps=0,
+        max_epochs=2
     )
     trainer.fit(model)
 
@@ -266,7 +272,8 @@ def test_full_val_loop(tmpdir):
         weights_summary=None,
         limit_train_batches=2,
         limit_val_batches=3,
-        num_sanity_val_steps=0
+        num_sanity_val_steps=0,
+        max_epochs=2
     )
     trainer.fit(model)
 

From 5b4b4ed115046078b0d622eeba48554541b70a16 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 10:30:09 -0400
Subject: [PATCH 15/25] fixing val step only

---
 pytorch_lightning/trainer/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 84ccdf136b248..ae03be6fbf6f7 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -129,8 +129,8 @@ class Trainer(
         >>> trainer.fit(model, train_loader)
         1
         >>> test_outputs = trainer.test(model, train_loader)
-        >>> len(test_outputs)# doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-        1
+        >>> len(test_outputs)
+        4
     """
     DEPRECATED_IN_0_9 = ('use_amp', 'show_progress_bar', 'training_tqdm_dict', 'num_tpu_cores')
 

From c65d5a84a70e71154fa75af526caa841b5669cb8 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 10:39:10 -0400
Subject: [PATCH 16/25] fixing val step only

---
 pytorch_lightning/trainer/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index ae03be6fbf6f7..c7905db586e52 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -128,8 +128,8 @@ class Trainer(
         >>> trainer = Trainer(max_epochs=1, progress_bar_refresh_rate=0)
         >>> trainer.fit(model, train_loader)
         1
-        >>> test_outputs = trainer.test(model, train_loader)
-        >>> len(test_outputs)
+        >>> test_outputs = trainer.test(model, train_loader)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+        >>> len(test_outputs)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
         4
     """
     DEPRECATED_IN_0_9 = ('use_amp', 'show_progress_bar', 'training_tqdm_dict', 'num_tpu_cores')

From aaa8f7890568ecc870fd2f5e461e2e3fa2cee4ac Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 11:14:00 -0400
Subject: [PATCH 17/25] fixing val step only

---
 pytorch_lightning/trainer/evaluation_loop.py |  3 ++-
 pytorch_lightning/trainer/trainer.py         | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index c21dc20fe50f6..f79c0dc724b56 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -176,6 +176,7 @@ class TrainerEvaluationLoopMixin(ABC):
     use_tpu: bool
     reload_dataloaders_every_epoch: ...
     tpu_id: int
+    verbose_test: bool
 
     # Callback system
     on_validation_batch_start: Callable
@@ -406,7 +407,7 @@ def run_evaluation(self, test_mode: bool = False):
                 self.add_progress_bar_metrics(prog_bar_metrics)
 
                 # log results of test
-                if test_mode and self.is_global_zero:
+                if test_mode and self.is_global_zero and self.verbose_test:
                     print('-' * 80)
                     print('TEST RESULTS')
                     pprint(callback_metrics)
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index c7905db586e52..4b3738dc1d70f 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -128,7 +128,7 @@ class Trainer(
         >>> trainer = Trainer(max_epochs=1, progress_bar_refresh_rate=0)
         >>> trainer.fit(model, train_loader)
         1
-        >>> test_outputs = trainer.test(model, train_loader)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+        >>> test_outputs = trainer.test(model, train_loader, verbose=False)
         >>> len(test_outputs)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
         4
     """
@@ -397,6 +397,9 @@ def __init__(
         self.test_dataloaders = None
         self.val_dataloaders = None
 
+        # when true, prints test results
+        self.verbose_test = True
+
         # when .test() is called, it sets this
         self.tested_ckpt_path = None
 
@@ -1223,7 +1226,8 @@ def test(
             self,
             model: Optional[LightningModule] = None,
             test_dataloaders: Optional[Union[DataLoader, List[DataLoader]]] = None,
-            ckpt_path: Optional[str] = 'best'
+            ckpt_path: Optional[str] = 'best',
+            verbose: bool = True
     ):
         r"""
 
@@ -1238,6 +1242,11 @@ def test(
             ckpt_path: Either ``best`` or path to the checkpoint you wish to test.
                 If ``None``, use the weights from the last epoch to test. Default to ``best``.
 
+            verbose: If True, prints the test results
+
+        Returns:
+            The final test result dictionary. If no test_epoch_end is defined, returns a list of dictionaries.
+
         Example::
 
             # Option 1
@@ -1277,6 +1286,8 @@ def test(
         # --------------------
         # SETUP HOOK
         # --------------------
+        self.verbose_test = verbose
+
         if self.global_rank != 0:
             return
 

From 08119511faad583a123fc9c0a9961ef985cba297 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 11:24:57 -0400
Subject: [PATCH 18/25] fixing val step only

---
 pytorch_lightning/trainer/trainer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 4b3738dc1d70f..d2c3406ef0d7e 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1178,6 +1178,8 @@ def run_pretrain_routine(self, model: LightningModule):
                         if isinstance(v, torch.Tensor):
                             result[k] = v.cpu().item()
 
+            return eval_loop_results
+
         # check if we should run validation during training
         self.disable_validation = not (self.is_overridden('validation_step') and self.limit_val_batches > 0) \
             and not self.fast_dev_run

From 4c7c80da4e1fed36216c3f099ea8dc7f73fa20e2 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 11:26:58 -0400
Subject: [PATCH 19/25] fixing val step only

---
 pytorch_lightning/trainer/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d2c3406ef0d7e..70e9c0f16dfba 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -129,8 +129,8 @@ class Trainer(
         >>> trainer.fit(model, train_loader)
         1
         >>> test_outputs = trainer.test(model, train_loader, verbose=False)
-        >>> len(test_outputs)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
-        4
+        >>> len(test_outputs)
+        25
     """
     DEPRECATED_IN_0_9 = ('use_amp', 'show_progress_bar', 'training_tqdm_dict', 'num_tpu_cores')
 

From f66dec9b2ca65e6012e386544b4b2907d698a030 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 11:36:41 -0400
Subject: [PATCH 20/25] fixing val step only

---
 pytorch_lightning/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 70e9c0f16dfba..1f611ab7ac57c 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1172,7 +1172,7 @@ def run_pretrain_routine(self, model: LightningModule):
                 return 1
 
             # remove the tensors from the eval results
-            for i, result in eval_loop_results:
+            for i, result in enumerate(eval_loop_results):
                 if isinstance(result, dict):
                     for k, v in result.items():
                         if isinstance(v, torch.Tensor):

From 3fb798253c7936e155ea589cae225e5567c19eec Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 11:47:07 -0400
Subject: [PATCH 21/25] fixing val step only

---
 pytorch_lightning/trainer/evaluation_loop.py | 2 +-
 tests/trainer/test_dataloaders.py            | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index f79c0dc724b56..4519d05c74fdf 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -313,7 +313,7 @@ def _evaluate(
 
             outputs.append(dl_outputs)
 
-        eval_results = {}
+        eval_results = outputs
 
         # with a single dataloader don't pass an array
         if len(dataloaders) == 1:
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index e76ef0e556352..05c003a766049 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -115,6 +115,8 @@ def test_multiple_val_dataloader(tmpdir):
         tpipes.run_prediction(dataloader, trainer.model)
 
 
+test_multiple_val_dataloader('')
+
 @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific'])
 def test_multiple_test_dataloader(tmpdir, ckpt_path):
     """Verify multiple test_dataloader."""

From aa95e4fdaefbce6a9d4766c781ff47f7c00733f0 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 12:09:44 -0400
Subject: [PATCH 22/25] fixing val step only

---
 pytorch_lightning/trainer/evaluation_loop.py | 2 +-
 tests/trainer/test_dataloaders.py            | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 4519d05c74fdf..440a4ea4e6ac3 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -387,7 +387,7 @@ def run_evaluation(self, test_mode: bool = False):
         # enable disabling validation step with limit_val_batches = 0
         should_skip = sum(max_batches) == 0
         if should_skip:
-            return
+            return [], []
 
         # run evaluation
         eval_results = self._evaluate(self.model, dataloaders, max_batches, test_mode)
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 05c003a766049..85b706e1dc9a4 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -115,8 +115,6 @@ def test_multiple_val_dataloader(tmpdir):
         tpipes.run_prediction(dataloader, trainer.model)
 
 
-test_multiple_val_dataloader('')
-
 @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific'])
 def test_multiple_test_dataloader(tmpdir, ckpt_path):
     """Verify multiple test_dataloader."""
@@ -297,7 +295,6 @@ def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, lim
     ]
     assert trainer.num_test_batches == expected_test_batches
 
-
 @pytest.mark.parametrize(
     ['limit_train_batches', 'limit_val_batches', 'limit_test_batches'],
     [

From d40c4dd15bb9c224dfb2a7676b6bef640ca7e3f7 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 12:23:55 -0400
Subject: [PATCH 23/25] fixing val step only

---
 tests/trainer/test_eval_loop_dict_return.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tests/trainer/test_eval_loop_dict_return.py b/tests/trainer/test_eval_loop_dict_return.py
index ef3a18fa1d979..d4e845badeb9b 100644
--- a/tests/trainer/test_eval_loop_dict_return.py
+++ b/tests/trainer/test_eval_loop_dict_return.py
@@ -125,7 +125,8 @@ def test_validation_step_dict_return(tmpdir):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(callback_metrics) == 5
+    assert len(callback_metrics) == 2
+    assert len(callback_metrics[0]) == 5
     assert len(eval_results) == 2
     assert eval_results[0]['log']['log_acc1'] == 12
     assert eval_results[1]['log']['log_acc1'] == 13
@@ -135,7 +136,7 @@ def test_validation_step_dict_return(tmpdir):
         assert k in eval_results[1]
 
     # ensure all the keys ended up as candidates for callbacks
-    assert len(trainer.callback_metrics) == 7
+    assert len(trainer.callback_metrics) == 8
 
     # make sure correct steps were called
     assert model.validation_step_called
@@ -196,7 +197,10 @@ def test_val_step_step_end(tmpdir):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(callback_metrics) == 6
+    assert len(callback_metrics) == 2
+    assert len(callback_metrics[0]) == 6
+
+    callback_metrics = callback_metrics[0]
     assert callback_metrics['val_step_end'] == 1802
     assert len(eval_results) == 2
     assert eval_results[0]['log']['log_acc1'] == 12
@@ -238,7 +242,8 @@ def test_no_val_step_end(tmpdir):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(callback_metrics) == 6
+    assert len(callback_metrics) == 1
+    assert len(callback_metrics[0]) == 6
     assert len(eval_results) == 1
 
     eval_results = eval_results[0]
@@ -280,7 +285,8 @@ def test_full_val_loop(tmpdir):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(callback_metrics) == 7
+    assert len(callback_metrics) == 1
+    assert len(callback_metrics[0]) == 7
     assert len(eval_results) == 1
 
     eval_results = eval_results[0]

From 2943f07d834079871acebca9825d60b66d6d94d4 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 12:33:54 -0400
Subject: [PATCH 24/25] fixing val step only

---
 tests/models/test_test_loop.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_test_loop.py b/tests/models/test_test_loop.py
index 89103116bd8f3..c65809ad25221 100644
--- a/tests/models/test_test_loop.py
+++ b/tests/models/test_test_loop.py
@@ -21,12 +21,12 @@ def test_single_gpu_test(tmpdir):
     trainer.fit(model)
     assert 'ckpt' in trainer.checkpoint_callback.best_model_path
     results = trainer.test()
-    assert 'test_acc' in results
+    assert 'test_acc' in results[0]
 
     old_weights = model.c_d1.weight.clone().detach().cpu()
 
     results = trainer.test(model)
-    assert 'test_acc' in results
+    assert 'test_acc' in results[0]
 
     # make sure weights didn't change
     new_weights = model.c_d1.weight.clone().detach().cpu()
@@ -50,12 +50,12 @@ def test_dp_test(tmpdir):
     trainer.fit(model)
     assert 'ckpt' in trainer.checkpoint_callback.best_model_path
     results = trainer.test()
-    assert 'test_acc' in results
+    assert 'test_acc' in results[0]
 
     old_weights = model.c_d1.weight.clone().detach().cpu()
 
     results = trainer.test(model)
-    assert 'test_acc' in results
+    assert 'test_acc' in results[0]
 
     # make sure weights didn't change
     new_weights = model.c_d1.weight.clone().detach().cpu()
@@ -79,12 +79,12 @@ def test_ddp_spawn_test(tmpdir):
     trainer.fit(model)
     assert 'ckpt' in trainer.checkpoint_callback.best_model_path
     results = trainer.test()
-    assert 'test_acc' in results
+    assert 'test_acc' in results[0]
 
     old_weights = model.c_d1.weight.clone().detach().cpu()
 
     results = trainer.test(model)
-    assert 'test_acc' in results
+    assert 'test_acc' in results[0]
 
     # make sure weights didn't change
     new_weights = model.c_d1.weight.clone().detach().cpu()

From d8f8977cfe76a1a814d4491c1444f7092be074dc Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Tue, 14 Jul 2020 12:43:34 -0400
Subject: [PATCH 25/25] fixing val step only

---
 tests/models/test_restore.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py
index 9331d6c7a540f..244439f7634d7 100644
--- a/tests/models/test_restore.py
+++ b/tests/models/test_restore.py
@@ -52,7 +52,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
     pretrained_model.cpu()
 
     # test we have good test accuracy
-    acc = results['test_acc']
+    acc = results[0]['test_acc']
     assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. test_acc = {acc}"
 
     dataloaders = model.test_dataloader()
@@ -102,7 +102,7 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
     results = new_trainer.test(pretrained_model)
     pretrained_model.cpu()
 
-    acc = results['test_acc']
+    acc = results[0]['test_acc']
     assert acc > 0.5, f"Model failed to get expected {0.5} accuracy. test_acc = {acc}"
 
     dataloaders = model.test_dataloader()