From 4d2b081968f57b2c0dfb2b70a5a24d5c2975e1ec Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 14 Jul 2020 21:03:20 -0400 Subject: [PATCH 001/168] r --- tests/base/model_valid_epoch_ends.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/base/model_valid_epoch_ends.py b/tests/base/model_valid_epoch_ends.py index 5170527397548..a7295aa9caef0 100644 --- a/tests/base/model_valid_epoch_ends.py +++ b/tests/base/model_valid_epoch_ends.py @@ -35,7 +35,6 @@ def validation_epoch_end__multiple_dataloaders(self, outputs): Args: outputs: list of individual outputs of each validation step """ - # if returned a scalar from validation_step, outputs is a list of tensor scalars # we return just the average in this case (if we want) def _mean(res, key): From 4513eb33ebfba692248691006d1da4fb77ee25f1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 07:19:24 -0400 Subject: [PATCH 002/168] r --- pytorch_lightning/core/step_result.py | 177 ++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 pytorch_lightning/core/step_result.py diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py new file mode 100644 index 0000000000000..ff347974b25e5 --- /dev/null +++ b/pytorch_lightning/core/step_result.py @@ -0,0 +1,177 @@ +from typing import Optional, Dict +from torch import Tensor +import torch + + +class Result(Dict): + + def __init__( + self, + minimize: Optional[Tensor] = None, + early_stop_on: Tensor = None, + checkpoint_on: Tensor = None, + hiddens: Optional[Tensor] = None + ): + + super().__init__() + + self.early_stop_on = early_stop_on + self.checkpoint_on = checkpoint_on + + self.hiddens = hiddens + self.minimize = minimize + + def log( + self, + name, + value, + prog_bar=False, + logger=True, + reduce_on_batch_end=False, + reduce_on_epoch_end=True, + reduce_fx=torch.mean + ): + if 'meta' not in self: + self.__setitem__('meta', {}) + self.__set_meta(name, value, prog_bar, logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx) + + # set the value + self.__setitem__(name, value) + + def __set_meta(self, name, value, prog_bar, logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx): + # set the meta for the item + meta_value = value + if isinstance(meta_value, torch.Tensor): + meta_value = meta_value.detach() + meta = dict( + prog_bar=prog_bar, + logger=logger, + reduce_on_batch_end=reduce_on_batch_end, + reduce_on_epoch_end=reduce_on_epoch_end, + reduce_fx=reduce_fx, + value=meta_value + ) + self['meta'][name] = meta + + @property + def hiddens(self): + return self._hiddens + + @hiddens.setter + def hiddens(self, x): + if x is not None: + assert isinstance(x, Tensor), 'hiddens must be a torch.Tensor' + self._hiddens = x + self.__setitem__('hiddens', x) + + @property + def checkpoint_on(self): + # use minimize as default if no checkpoint_on is passed + if 'checkpoint_on' not in self: + minimize = self.__getitem__('minimize') + self.__setitem__('checkpoint_on', minimize) + + return self.__getitem__('checkpoint_on') + + @checkpoint_on.setter + def checkpoint_on(self, x): + if x is not None: + assert isinstance(x, Tensor), 'checkpoint_on must be a torch.Tensor' + self.__setitem__('checkpoint_on', x.detach()) + + @property + def early_stop_on(self): + # use minimize as default if no checkpoint_on is passed + if 'early_stop_on' not in self: + minimize = self.__getitem__('minimize') + self.__setitem__('early_stop_on', minimize) + + return self.__getitem__('early_stop_on') + + @early_stop_on.setter + def 
early_stop_on(self, x): + if x is not None: + assert isinstance(x, Tensor), 'early_stop_on must be a torch.Tensor' + self.__setitem__('early_stop_on', x.detach()) + + @property + def minimize(self): + return self.__getitem__('minimize') + + @minimize.setter + def minimize(self, x): + if x is not None: + assert isinstance(x, Tensor), 'metric to minimize must be a torch.Tensor' + m = 'the metric to minimize must have a computational graph. Minimize ' \ + 'can only be used in training_end, training_step_end, training_epoch_end' + assert x.grad_fn is not None, m + self.__setitem__('minimize', x) + + def __repr__(self): + copy = self.copy() + del copy['meta'] + + return str(copy) + + def __str__(self): + copy = self.copy() + del copy['meta'] + + return str(copy) + + +class TrainResult(Result): + + def __init__( + self, + minimize: Optional[Tensor] = None, + early_stop_on: Tensor = None, + checkpoint_on: Tensor = None, + hiddens: Optional[Tensor] = None + ): + + super().__init__(minimize, early_stop_on, checkpoint_on, hiddens) + + def log( + self, + name, + value, + prog_bar=False, + logger=True, + reduce_on_batch_end=True, + reduce_on_epoch_end=False, + reduce_fx=torch.mean + ): + super().log(name, value, prog_bar, logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx) + + +class EvalResult(Result): + + def __init__( + self, + early_stop_on: Tensor = None, + checkpoint_on: Tensor = None, + hiddens: Optional[Tensor] = None + ): + + super().__init__(None, early_stop_on, checkpoint_on, hiddens) + + def log( + self, + name, + value, + prog_bar=False, + logger=True, + reduce_on_batch_end=False, + reduce_on_epoch_end=True, + reduce_fx=torch.mean + ): + super().log(name, value, prog_bar, logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx) + + +if __name__ == '__main__': + import torch + result = EvalResult() + result.log('some', 123) + print(result) + result.minimize = torch.tensor(1) \ No newline at end of file From 5cc01ffc36dc011389af2c6e17232e46a81131a8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 12:26:04 -0400 Subject: [PATCH 003/168] r --- pytorch_lightning/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 1413d8d62cc27..09783e6d18382 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -55,6 +55,7 @@ from pytorch_lightning.trainer import Trainer from pytorch_lightning.utilities.seed import seed_everything from pytorch_lightning import metrics + from pytorch_lightning.core.step_result import TrainResult, EvalResult __all__ = [ 'Trainer', @@ -62,7 +63,9 @@ 'Callback', 'data_loader', 'seed_everything', - 'metrics' + 'metrics', + 'EvalResult', + 'TrainResult' ] # necessary for regular bolts imports. 
Skip exception since bolts is not always installed From c747f802b138f0fbf95e0706d3cef5557610f38b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 13:12:28 -0400 Subject: [PATCH 004/168] patched optimizer closure with sr --- pytorch_lightning/core/step_result.py | 14 +++++++- pytorch_lightning/trainer/training_loop.py | 42 +++++++++++++++------- tests/base/deterministic_model.py | 12 +++++++ 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index ff347974b25e5..6e4f4552c5a23 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -1,6 +1,7 @@ from typing import Optional, Dict from torch import Tensor import torch +from copy import copy class Result(Dict): @@ -18,7 +19,7 @@ def __init__( self.early_stop_on = early_stop_on self.checkpoint_on = checkpoint_on - self.hiddens = hiddens + self._hiddens = hiddens self.minimize = minimize def log( @@ -107,6 +108,11 @@ def minimize(self, x): assert x.grad_fn is not None, m self.__setitem__('minimize', x) + def detach(self): + for k, v in self.items(): + if isinstance(v, torch.Tensor) and v.grad_fn is not None: + self.__setitem__(k, v.detach()) + def __repr__(self): copy = self.copy() del copy['meta'] @@ -119,6 +125,12 @@ def __str__(self): return str(copy) + def __copy__(self): + newone = type(self)() + for k, v in self.items(): + newone[k] = copy(v) + return newone + class TrainResult(Result): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index fa493f2e1b09a..dc069aad1b553 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -153,6 +153,7 @@ def training_step(self, batch, batch_idx): import torch from torch.utils.data import DataLoader import torch.distributed as torch_distrib +from copy import copy from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback @@ -164,6 +165,7 @@ def training_step(self, batch, batch_idx): from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict from pytorch_lightning.utilities.memory import recursive_detach +from pytorch_lightning.core.step_result import EvalResult, TrainResult, Result try: from apex import amp @@ -780,26 +782,38 @@ def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens) # ---------------------------- # format and reduce outputs accordingly training_step_output_for_epoch_end = training_step_output - training_step_output = self.process_output(training_step_output, train=True) - - # TODO: temporary part of structured results PR - training_step_output = AttributeDict( - batch_loss=training_step_output[0], - pbar_on_batch_end=training_step_output[1], - log_metrics=training_step_output[2], - callback_metrics=training_step_output[3], - hiddens=training_step_output[4], - ) + is_result_obj = isinstance(training_step_output, Result) + + # don't allow EvalResult in the training_step + if isinstance(training_step_output, EvalResult): + raise MisconfigurationException('training_step cannot return EvalResult, ' + 'use a dict or TrainResult instead') + + # handle regular dicts + if not is_result_obj: + training_step_output = self.process_output(training_step_output, train=True) + + training_step_output = AttributeDict( + batch_loss=training_step_output[0], + pbar_on_batch_end=training_step_output[1], + 
log_metrics=training_step_output[2], + callback_metrics=training_step_output[3], + hiddens=training_step_output[4], + ) # if the user decides to finally reduce things in epoch_end, save raw output without graphs if isinstance(training_step_output_for_epoch_end, torch.Tensor): training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach() + elif is_result_obj: + training_step_output_for_epoch_end = copy(training_step_output) + training_step_output_for_epoch_end.detach() else: training_step_output_for_epoch_end = recursive_detach(training_step_output_for_epoch_end) # accumulate loss # (if accumulate_grad_batches = 1 no effect) - closure_loss = training_step_output.batch_loss / self.accumulate_grad_batches + closure_loss = training_step_output.minimize if is_result_obj else training_step_output.batch_loss + closure_loss = closure_loss / self.accumulate_grad_batches # the loss will get scaled for amp. avoid any modifications to it untouched_loss = closure_loss.detach().clone() @@ -829,7 +843,11 @@ def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens) # once backward has been applied, release graph closure_loss = closure_loss.detach() - training_step_output.batch_loss = training_step_output.batch_loss.detach() + + if is_result_obj: + training_step_output.detach() + else: + training_step_output.batch_loss = training_step_output.batch_loss.detach() if self.use_horovod: # Synchronize Horovod to ensure gradient manipulations (e.g., loss scaling) are valid diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index a4988673c60a4..9315ef625a5ff 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -2,6 +2,7 @@ import torch from torch import nn from torch.utils.data import Dataset, DataLoader +from pytorch_lightning import TrainResult from pytorch_lightning.core.lightning import LightningModule @@ -97,6 +98,17 @@ def training_epoch_end_scalar(self, outputs): prototype_loss = outputs[0] return prototype_loss + # -------------------------- + # Result returns + # -------------------------- + def training_step_result_return(self, batch, batch_idx): + acc = self.step(batch, batch_idx) + + result = TrainResult(minimize=acc) + result.log('log_acc1', torch.tensor(12).type_as(acc), reduce_on_epoch_end=True) + + return result + # -------------------------- # dictionary returns # -------------------------- From ed9b4f8d1341c4795b98aa2f398548977265c446 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 13:47:51 -0400 Subject: [PATCH 005/168] patched optimizer closure with sr --- pytorch_lightning/core/step_result.py | 35 ++++++++++++++++++++++ pytorch_lightning/trainer/training_loop.py | 20 ++++++++++--- 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 6e4f4552c5a23..bd2265f49cb6a 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -108,6 +108,41 @@ def minimize(self, x): assert x.grad_fn is not None, m self.__setitem__('minimize', x) + @property + def callback_metrics(self): + result = { + 'early_stop_on': self.early_stop_on, + 'checkpoint_on': self.checkpoint_on + } + + return result + + @property + def batch_log_metrics(self): + """ + Gets the metrics to log at the end of the batch step + """ + result = {} + + meta = self['meta'] + for k, options in meta.items(): + if options['logger']: + result[k] = options['value'] + return result + + 
@property + def batch_pbar_metrics(self): + """ + Gets the metrics to log at the end of the batch step + """ + result = {} + + meta = self['meta'] + for k, options in meta.items(): + if options['prog_bar']: + result[k] = options['value'] + return result + def detach(self): for k, v in self.items(): if isinstance(v, torch.Tensor) and v.grad_fn is not None: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index dc069aad1b553..a061c55fb69a6 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -624,7 +624,7 @@ def run_training_batch(self, batch, batch_idx): param.requires_grad = True # ------------------- - # calculate loss + # calculate loss (train step + train step end) # ------------------- opt_closure_result = self.optimizer_closure( split_batch, @@ -633,14 +633,26 @@ def run_training_batch(self, batch, batch_idx): optimizer, self.hiddens ) + is_result_obj = isinstance(opt_closure_result.training_step_output, Result) # ------------------------------ # POST forward bookkeeping # ------------------------------ batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics) - batch_log_metrics.append(opt_closure_result.training_step_output.log_metrics) - self.add_progress_bar_metrics(opt_closure_result.training_step_output.pbar_on_batch_end) + # add metrics to loggers + if is_result_obj: + metrics_to_log = opt_closure_result.training_step_output.batch_log_metrics + else: + metrics_to_log = opt_closure_result.training_step_output.log_metrics + batch_log_metrics.append(metrics_to_log) + + # add metrics to progress bar + if is_result_obj: + metrics_for_pbar = opt_closure_result.training_step_output.batch_pbar_metrics + else: + metrics_for_pbar = opt_closure_result.training_step_output.pbar_on_batch_end + self.add_progress_bar_metrics(metrics_for_pbar) # track hiddens self.hiddens = opt_closure_result.hiddens @@ -766,7 +778,7 @@ def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens) wrap the forward step in a closure so second order methods work """ # --------------------------- - # FORWARD + # FORWARD (TRAINING STEP + TRAIN STEP END) # --------------------------- with self.profiler.profile('model_forward'): if self.use_amp and NATIVE_AMP_AVALAIBLE and not self.use_tpu: From 3f98d18e7258d9951118677350da96548510d3f0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 13:49:20 -0400 Subject: [PATCH 006/168] patched optimizer closure with sr --- .../test_trainer_steps_result_return.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/trainer/test_trainer_steps_result_return.py diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py new file mode 100644 index 0000000000000..f610c64e1cef9 --- /dev/null +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -0,0 +1,49 @@ +""" +Tests to ensure that the training loop works with a dict +""" +from pytorch_lightning import Trainer +from tests.base.deterministic_model import DeterministicModel + + +def test_training_step_result(tmpdir): + """ + Tests that only training_step can be used + """ + model = DeterministicModel() + model.training_step = model.training_step_result_return + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + weights_summary=None, + ) + trainer.fit(model) + + # make sure correct steps were called + assert 
model.training_step_called + assert not model.training_step_end_called + assert not model.training_epoch_end_called + + # make sure training outputs what is expected + for batch_idx, batch in enumerate(model.train_dataloader()): + break + + out = trainer.run_training_batch(batch, batch_idx) + assert out.signal == 0 + assert out.batch_log_metrics['log_acc1'] == 12.0 + assert out.batch_log_metrics['log_acc2'] == 7.0 + + train_step_out = out.training_step_output_for_epoch_end + pbar_metrics = train_step_out['progress_bar'] + assert 'log' in train_step_out + assert 'progress_bar' in train_step_out + assert train_step_out['train_step_test'] == 549 + assert pbar_metrics['pbar_acc1'] == 17.0 + assert pbar_metrics['pbar_acc2'] == 19.0 + + # make sure the optimizer closure returns the correct things + opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) + assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + +test_training_step_result('') \ No newline at end of file From 8352a56bab9c0b827864ce9e1c26b058b9395d92 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 14:11:01 -0400 Subject: [PATCH 007/168] added train step structured result --- pytorch_lightning/trainer/logging.py | 4 ++ pytorch_lightning/trainer/trainer.py | 3 ++ tests/base/deterministic_model.py | 5 ++- .../test_trainer_steps_result_return.py | 37 ++++++++++++++----- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index 35f5d5d35b9ca..17b48aeface66 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -1,3 +1,4 @@ +import os from abc import ABC from typing import Union, Iterable @@ -73,6 +74,9 @@ def log_metrics(self, metrics, grad_norm_dic, step=None): self.logger.agg_and_log_metrics(scalar_metrics, step=step) self.logger.save() + if 'PL_DEV_DEBUG' in os.environ: + self.debug_logged_metrics.append(scalar_metrics) + def add_progress_bar_metrics(self, metrics): for k, v in metrics.items(): if isinstance(v, torch.Tensor): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 1f611ab7ac57c..b626280f17b5e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -616,6 +616,9 @@ def __init__( self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') + # for debugging purposes only, track the logged metrics + self.debug_logged_metrics = [] + # Callback system self.on_init_end() diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 9315ef625a5ff..e55d2dff33385 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -105,8 +105,11 @@ def training_step_result_return(self, batch, batch_idx): acc = self.step(batch, batch_idx) result = TrainResult(minimize=acc) - result.log('log_acc1', torch.tensor(12).type_as(acc), reduce_on_epoch_end=True) + result.log('log_and_pbar_acc1', torch.tensor(12).type_as(acc), reduce_on_epoch_end=True, prog_bar=True) + result.log('log_acc2', torch.tensor(7).type_as(acc), reduce_on_epoch_end=True) + result.log('pbar_acc3', torch.tensor(17).type_as(acc), reduce_on_epoch_end=True, logger=False, prog_bar=True) + self.training_step_called = True return result # -------------------------- diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index f610c64e1cef9..891a4a4311747 100644 --- 
a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -1,14 +1,19 @@ """ Tests to ensure that the training loop works with a dict """ +import os from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel +from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult def test_training_step_result(tmpdir): """ Tests that only training_step can be used """ + # enable internal debugging actions + os.environ['PL_DEV_DEBUG'] = '1' + model = DeterministicModel() model.training_step = model.training_step_result_return model.val_dataloader = None @@ -31,19 +36,33 @@ def test_training_step_result(tmpdir): out = trainer.run_training_batch(batch, batch_idx) assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 12.0 + assert out.batch_log_metrics['log_and_pbar_acc1'] == 12.0 assert out.batch_log_metrics['log_acc2'] == 7.0 train_step_out = out.training_step_output_for_epoch_end - pbar_metrics = train_step_out['progress_bar'] - assert 'log' in train_step_out - assert 'progress_bar' in train_step_out - assert train_step_out['train_step_test'] == 549 - assert pbar_metrics['pbar_acc1'] == 17.0 - assert pbar_metrics['pbar_acc2'] == 19.0 + assert isinstance(train_step_out, TrainResult) + + assert 'minimize' in train_step_out + assert 'log_and_pbar_acc1' in train_step_out + assert 'log_acc2' in train_step_out + + # make sure we are using the correct metrics for callbacks + assert trainer.callback_metrics['early_stop_on'] == 171 + assert trainer.callback_metrics['checkpoint_on'] == 171 + + # make sure pbar metrics are correct + assert trainer.progress_bar_metrics['log_and_pbar_acc1'] == 12 + assert trainer.progress_bar_metrics['pbar_acc3'] == 17 + assert 'log_acc2' not in trainer.progress_bar_metrics + + # make sure correct metrics are logged + assert len(trainer.debug_logged_metrics) == 1 + logged_metrics = trainer.debug_logged_metrics[0] + assert logged_metrics['log_and_pbar_acc1'] == 12.0 + assert logged_metrics['log_acc2'] == 7.0 + assert 'pbar_acc3' not in logged_metrics + assert len(logged_metrics) == 3 # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) - -test_training_step_result('') \ No newline at end of file From 7d453d4f3234e7144bffbdf18bb11d6fa37b9c01 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 14:11:31 -0400 Subject: [PATCH 008/168] added train step structured result --- tests/trainer/test_trainer_steps_result_return.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 891a4a4311747..961f5adfc17d0 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -9,7 +9,8 @@ def test_training_step_result(tmpdir): """ - Tests that only training_step can be used + Tests that only training_step can be used with TrainResult + Makes sure that things are routed to pbar, loggers and loss accordingly """ # enable internal debugging actions os.environ['PL_DEV_DEBUG'] = '1' From 23403ce18430b1781b8feb80e4332beb26dc7e4a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 14:12:31 -0400 Subject: [PATCH 009/168] added train step structured result --- 
tests/trainer/test_trainer_steps_result_return.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 961f5adfc17d0..ada6ffe1916fb 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -67,3 +67,11 @@ def test_training_step_result(tmpdir): # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + + +def test_training_step_epoch_end_result(tmpdir): + """ + Makes sure training_step and epoch_end can be used with Results (without batch_end) + """ + # TODO: implement + pass From 9bc77ac10fbad6a15c24a2447c08f9908b0a74cb Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 15 Jul 2020 14:27:08 -0400 Subject: [PATCH 010/168] added train step structured result --- tests/trainer/test_trainer_steps_result_return.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index ada6ffe1916fb..9db8fae8893d7 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -7,6 +7,12 @@ from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +# TODOs: +# make checkpoint and early stopping use the correct metrics +# make sure step_ends receive a plain dict +# same for epoch_end +# make sure to auto-reduce when no epoch_end is implemented + def test_training_step_result(tmpdir): """ Tests that only training_step can be used with TrainResult From 9cdaf8fb396e5133bea13c4503d9ac16bc09f5b4 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 16 Jul 2020 06:54:15 -0400 Subject: [PATCH 011/168] added train step structured result --- pytorch_lightning/core/step_result.py | 54 +++++++++++++++++-- pytorch_lightning/trainer/training_loop.py | 4 ++ tests/base/deterministic_model.py | 19 +++++++ .../test_trainer_steps_result_return.py | 25 ++++++++- 4 files changed, 97 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index bd2265f49cb6a..5847cce9cb184 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -22,6 +22,15 @@ def __init__( self._hiddens = hiddens self.minimize = minimize + def __getattr__(self, key): + try: + return self[key] + except KeyError: + raise AttributeError(f'Missing attribute "{key}"') + + def __setattr__(self, key, val): + self[key] = val + def log( self, name, @@ -149,10 +158,12 @@ def detach(self): self.__setitem__(k, v.detach()) def __repr__(self): - copy = self.copy() - del copy['meta'] + self_copy = self.copy() - return str(copy) + if 'meta' in self_copy: + del self_copy['meta'] + + return str(self_copy) def __str__(self): copy = self.copy() @@ -166,6 +177,42 @@ def __copy__(self): newone[k] = copy(v) return newone + @classmethod + def gather(cls, outputs): + meta = outputs[0]['meta'] + result = Result() + result = recursive_gather(outputs, result) + recursive_stack(result) + result['meta'] = meta + return result + + +def recursive_gather(outputs, result=None): + for out in outputs: + if 'meta' in out: + del out['meta'] + + for k, v in out.items(): + if isinstance(v, dict): + v = recursive_gather([v], result) + + if k not in result: + 
result[k] = [] + + result[k].append(v) + + return result + + +def recursive_stack(result): + for k, v in result.items(): + if isinstance(v, dict): + recursive_stack(v) + + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], torch.Tensor): + v = torch.stack(v) + result[k] = v + class TrainResult(Result): @@ -219,6 +266,7 @@ def log( if __name__ == '__main__': import torch result = EvalResult() + result.minimize = 2 result.log('some', 123) print(result) result.minimize = torch.tensor(1) \ No newline at end of file diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index a061c55fb69a6..58540af8d1d85 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -531,6 +531,10 @@ def run_training_epoch_end(self, epoch_output): model = self.get_model() if self.is_overridden('training_epoch_end', model=model): self.global_step += 1 + + if isinstance(epoch_output[0], Result): + epoch_output = Result.gather(epoch_output) + epoch_output = model.training_epoch_end(epoch_output) _processed_outputs = self.process_output(epoch_output) log_epoch_metrics = _processed_outputs[2] diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index e55d2dff33385..b46f18e7b9c4d 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -112,6 +112,25 @@ def training_step_result_return(self, batch, batch_idx): self.training_step_called = True return result + def training_epoch_end_return(self, outputs): + """ + There should be an array of scalars without graphs that are all 171 (4 of them) + """ + self.training_epoch_end_called = True + + if self.use_dp or self.use_ddp2: + pass + else: + # only saw 4 batches + assert len(outputs) == 4 + for batch_out in outputs: + assert batch_out == 171 + assert batch_out.grad_fn is None + assert isinstance(batch_out, torch.Tensor) + + prototype_loss = outputs[0] + return prototype_loss + # -------------------------- # dictionary returns # -------------------------- diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 9db8fae8893d7..b532ed7729e7c 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -74,10 +74,31 @@ def test_training_step_result(tmpdir): opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + # TODO: test that it gets reduced on epoch end + # TODO: test that on batch end gets reduced + def test_training_step_epoch_end_result(tmpdir): """ Makes sure training_step and epoch_end can be used with Results (without batch_end) """ - # TODO: implement - pass + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = model.training_step_result_return + model.training_epoch_end = model.training_epoch_end_return + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + weights_summary=None, + ) + trainer.fit(model) + + # make sure correct steps were called + assert model.training_step_called + assert not model.training_step_end_called + assert model.training_epoch_end_called + +test_training_step_epoch_end_result('') \ No newline at end of file From 9309b9e1cd115941940f62323e1f5ad9d9d99395 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 16 Jul 2020 07:19:36 -0400 Subject: [PATCH 012/168] 
added train step structured result --- pytorch_lightning/core/step_result.py | 104 +++++++++++--------------- 1 file changed, 43 insertions(+), 61 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 5847cce9cb184..3178f4c9f86ed 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -19,18 +19,57 @@ def __init__( self.early_stop_on = early_stop_on self.checkpoint_on = checkpoint_on - self._hiddens = hiddens + self.hiddens = hiddens self.minimize = minimize + if minimize is not None and early_stop_on is None: + self.early_stop_on = minimize.detach() + if minimize is not None and checkpoint_on is None: + self.checkpoint_on = minimize.detach() + def __getattr__(self, key): try: - return self[key] + if key == 'callback_metrics': + return self.callback_metrics() + elif key == 'batch_log_metrics': + return self.batch_log_metrics() + elif key == 'batch_pbar_metrics': + return self.batch_pbar_metrics() + else: + return self[key] except KeyError: raise AttributeError(f'Missing attribute "{key}"') def __setattr__(self, key, val): + # ensure reserve keys are tensors and detached + if key in {'hiddens', 'checkpoint_on', 'early_stop_on'}: + self._assert_tensor_metric(key, val) + val = val.detach() + + # ensure minimize is a tensor and has grads + elif key == 'minimize': + err = 'Minimize can only be used in training_end, training_step_end, training_epoch_end' + self._assert_grad_tensor_metric(key, val, err) + + # ensure anything else that is a tensor is detached + elif isinstance(val, torch.Tensor): + val = val.detach() + self[key] = val + def _assert_tensor_metric(self, name, x): + if x is not None: + assert isinstance(x, Tensor), f'{name} must be a torch.Tensor' + + def _assert_grad_tensor_metric(self, name, x, additional_err: str = None): + if x is not None: + assert isinstance(x, Tensor), f'{name} must be a torch.Tensor' + m = f'{name} must have a computational graph.' 
+ + if additional_err: + m += f' {additional_err}' + assert x.grad_fn is not None, m + def log( self, name, @@ -63,61 +102,6 @@ def __set_meta(self, name, value, prog_bar, logger, reduce_on_batch_end, reduce_ ) self['meta'][name] = meta - @property - def hiddens(self): - return self._hiddens - - @hiddens.setter - def hiddens(self, x): - if x is not None: - assert isinstance(x, Tensor), 'hiddens must be a torch.Tensor' - self._hiddens = x - self.__setitem__('hiddens', x) - - @property - def checkpoint_on(self): - # use minimize as default if no checkpoint_on is passed - if 'checkpoint_on' not in self: - minimize = self.__getitem__('minimize') - self.__setitem__('checkpoint_on', minimize) - - return self.__getitem__('checkpoint_on') - - @checkpoint_on.setter - def checkpoint_on(self, x): - if x is not None: - assert isinstance(x, Tensor), 'checkpoint_on must be a torch.Tensor' - self.__setitem__('checkpoint_on', x.detach()) - - @property - def early_stop_on(self): - # use minimize as default if no checkpoint_on is passed - if 'early_stop_on' not in self: - minimize = self.__getitem__('minimize') - self.__setitem__('early_stop_on', minimize) - - return self.__getitem__('early_stop_on') - - @early_stop_on.setter - def early_stop_on(self, x): - if x is not None: - assert isinstance(x, Tensor), 'early_stop_on must be a torch.Tensor' - self.__setitem__('early_stop_on', x.detach()) - - @property - def minimize(self): - return self.__getitem__('minimize') - - @minimize.setter - def minimize(self, x): - if x is not None: - assert isinstance(x, Tensor), 'metric to minimize must be a torch.Tensor' - m = 'the metric to minimize must have a computational graph. Minimize ' \ - 'can only be used in training_end, training_step_end, training_epoch_end' - assert x.grad_fn is not None, m - self.__setitem__('minimize', x) - - @property def callback_metrics(self): result = { 'early_stop_on': self.early_stop_on, @@ -126,7 +110,6 @@ def callback_metrics(self): return result - @property def batch_log_metrics(self): """ Gets the metrics to log at the end of the batch step @@ -139,7 +122,6 @@ def batch_log_metrics(self): result[k] = options['value'] return result - @property def batch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step @@ -265,8 +247,8 @@ def log( if __name__ == '__main__': import torch - result = EvalResult() - result.minimize = 2 + result = TrainResult() + result.hiddens = torch.tensor(1) result.log('some', 123) print(result) result.minimize = torch.tensor(1) \ No newline at end of file From 824113054835b3e0c865103301da700a8e56abd6 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 16 Jul 2020 07:29:04 -0400 Subject: [PATCH 013/168] added train step structured result --- pytorch_lightning/core/step_result.py | 30 +++++++++++++++------------ 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 3178f4c9f86ed..ec8e8459224de 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -16,11 +16,14 @@ def __init__( super().__init__() - self.early_stop_on = early_stop_on - self.checkpoint_on = checkpoint_on - - self.hiddens = hiddens - self.minimize = minimize + if early_stop_on is not None: + self.early_stop_on = early_stop_on + if checkpoint_on is not None: + self.checkpoint_on = checkpoint_on + if hiddens is not None: + self.hiddens = hiddens + if minimize is not None: + self.minimize = minimize if minimize is not None and early_stop_on is 
None: self.early_stop_on = minimize.detach() @@ -30,21 +33,22 @@ def __init__( def __getattr__(self, key): try: if key == 'callback_metrics': - return self.callback_metrics() + return self.get_callback_metrics() elif key == 'batch_log_metrics': - return self.batch_log_metrics() + return self.get_batch_log_metrics() elif key == 'batch_pbar_metrics': - return self.batch_pbar_metrics() + return self.get_batch_pbar_metrics() else: return self[key] except KeyError: - raise AttributeError(f'Missing attribute "{key}"') + return None def __setattr__(self, key, val): # ensure reserve keys are tensors and detached if key in {'hiddens', 'checkpoint_on', 'early_stop_on'}: self._assert_tensor_metric(key, val) - val = val.detach() + if val is not None: + val = val.detach() # ensure minimize is a tensor and has grads elif key == 'minimize': @@ -102,7 +106,7 @@ def __set_meta(self, name, value, prog_bar, logger, reduce_on_batch_end, reduce_ ) self['meta'][name] = meta - def callback_metrics(self): + def get_callback_metrics(self): result = { 'early_stop_on': self.early_stop_on, 'checkpoint_on': self.checkpoint_on @@ -110,7 +114,7 @@ def callback_metrics(self): return result - def batch_log_metrics(self): + def get_batch_log_metrics(self): """ Gets the metrics to log at the end of the batch step """ @@ -122,7 +126,7 @@ def batch_log_metrics(self): result[k] = options['value'] return result - def batch_pbar_metrics(self): + def get_batch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step """ From ceeedc21767bbb4a9be42b77917518bf9ff1d6ef Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 16 Jul 2020 07:33:52 -0400 Subject: [PATCH 014/168] added train step structured result --- pytorch_lightning/core/step_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index ec8e8459224de..7883335a02aba 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -140,7 +140,7 @@ def get_batch_pbar_metrics(self): def detach(self): for k, v in self.items(): - if isinstance(v, torch.Tensor) and v.grad_fn is not None: + if isinstance(v, torch.Tensor): self.__setitem__(k, v.detach()) def __repr__(self): From 6bbe6d83e84c455ce1b2e199fc3a17f8913ab3fa Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 16 Jul 2020 08:08:15 -0400 Subject: [PATCH 015/168] added train step structured result --- pytorch_lightning/core/step_result.py | 2 +- pytorch_lightning/trainer/training_loop.py | 22 +++++++++++++++------- tests/base/deterministic_model.py | 19 ++++++++++--------- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 7883335a02aba..ece9d34c30187 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -166,7 +166,7 @@ def __copy__(self): @classmethod def gather(cls, outputs): meta = outputs[0]['meta'] - result = Result() + result = cls() result = recursive_gather(outputs, result) recursive_stack(result) result['meta'] = meta diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 58540af8d1d85..1375e848ad24b 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -533,21 +533,29 @@ def run_training_epoch_end(self, epoch_output): self.global_step += 1 if isinstance(epoch_output[0], Result): - epoch_output = 
Result.gather(epoch_output) + epoch_output = epoch_output[0].__class__.gather(epoch_output) epoch_output = model.training_epoch_end(epoch_output) - _processed_outputs = self.process_output(epoch_output) - log_epoch_metrics = _processed_outputs[2] - callback_epoch_metrics = _processed_outputs[3] + if isinstance(epoch_output, Result): + epoch_log_metrics = epoch_output.epoch_log_metrics + epoch_progress_bar_metrics = epoch_output.epoch_progress_bar_metrics + epoch_callback_metrics = epoch_output.epoch_callback_metrics + else: + _processed_outputs = self.process_output(epoch_output) + epoch_progress_bar_metrics = _processed_outputs[1] + epoch_log_metrics = _processed_outputs[2] + epoch_callback_metrics = _processed_outputs[3] + + # TODO: do all of this for the user when no training_epoch end is defined and they used a result # add the metrics to the loggers - self.log_metrics(log_epoch_metrics, {}) + self.log_metrics(epoch_log_metrics, {}) # add metrics to callbacks - self.callback_metrics.update(callback_epoch_metrics) + self.callback_metrics.update(epoch_callback_metrics) # add metrics to progress_bar - self.add_progress_bar_metrics(_processed_outputs[1]) + self.add_progress_bar_metrics(epoch_progress_bar_metrics) def sync_horovod(self): if self.use_horovod: diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index b46f18e7b9c4d..bc70652dd49f8 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -112,7 +112,7 @@ def training_step_result_return(self, batch, batch_idx): self.training_step_called = True return result - def training_epoch_end_return(self, outputs): + def training_epoch_end_return(self, result): """ There should be an array of scalars without graphs that are all 171 (4 of them) """ @@ -122,14 +122,15 @@ def training_epoch_end_return(self, outputs): pass else: # only saw 4 batches - assert len(outputs) == 4 - for batch_out in outputs: - assert batch_out == 171 - assert batch_out.grad_fn is None - assert isinstance(batch_out, torch.Tensor) - - prototype_loss = outputs[0] - return prototype_loss + assert isinstance(result, TrainResult) + assert len(result.minimize) == 4 + assert self.count_num_graphs(result) == 0 + assert result.minimize.mean() == 171 + + result.log_acc2 = result.log_acc2.mean() + result.log_and_pbar_acc1 = result.log_and_pbar_acc1.mean() + result.pbar_acc3 = result.pbar_acc3.mean() + return result # -------------------------- # dictionary returns From c56ea84c0dd7d8b25e87a8ae693f4c58dc0ef1a2 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 16 Jul 2020 22:03:44 -0400 Subject: [PATCH 016/168] added train step structured result --- pytorch_lightning/core/step_result.py | 23 ++++++++++++++++------ pytorch_lightning/trainer/training_loop.py | 12 +++++++++++ tests/base/deterministic_model.py | 3 --- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index ece9d34c30187..58cb7d75da61a 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -23,6 +23,8 @@ def __init__( if hiddens is not None: self.hiddens = hiddens if minimize is not None: + err = 'Minimize can only be used in training_end, training_step_end, training_epoch_end' + self._assert_grad_tensor_metric('minimize', minimize, err) self.minimize = minimize if minimize is not None and early_stop_on is None: @@ -38,6 +40,8 @@ def __getattr__(self, key): return self.get_batch_log_metrics() elif key == 
'batch_pbar_metrics': return self.get_batch_pbar_metrics() + elif key == 'epoch_log_metrics': + return self.get_epoch_log_metrics() else: return self[key] except KeyError: @@ -50,13 +54,8 @@ def __setattr__(self, key, val): if val is not None: val = val.detach() - # ensure minimize is a tensor and has grads - elif key == 'minimize': - err = 'Minimize can only be used in training_end, training_step_end, training_epoch_end' - self._assert_grad_tensor_metric(key, val, err) - # ensure anything else that is a tensor is detached - elif isinstance(val, torch.Tensor): + elif isinstance(val, torch.Tensor) and key != 'minimize': val = val.detach() self[key] = val @@ -126,6 +125,18 @@ def get_batch_log_metrics(self): result[k] = options['value'] return result + def get_epoch_log_metrics(self): + """ + Gets the metrics to log at the end of the batch step + """ + result = {} + + meta = self['meta'] + for k, options in meta.items(): + if options['logger']: + result[k] = options['value'] + return result + def get_batch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 1375e848ad24b..f1128062ff153 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -532,11 +532,23 @@ def run_training_epoch_end(self, epoch_output): if self.is_overridden('training_epoch_end', model=model): self.global_step += 1 + # remove the protected keys so the user doesn't have to deal with them if isinstance(epoch_output[0], Result): epoch_output = epoch_output[0].__class__.gather(epoch_output) + minimize = epoch_output.minimize + early_stop_on = epoch_output.early_stop_on + checkpoint_on = epoch_output.checkpoint_on + del epoch_output['minimize'] + del epoch_output['early_stop_on'] + del epoch_output['checkpoint_on'] epoch_output = model.training_epoch_end(epoch_output) + if isinstance(epoch_output, Result): + epoch_output.minimize = minimize.mean() + epoch_output.early_stop_on = early_stop_on.mean() + epoch_output.checkpoint_on = checkpoint_on.mean() + if isinstance(epoch_output, Result): epoch_log_metrics = epoch_output.epoch_log_metrics epoch_progress_bar_metrics = epoch_output.epoch_progress_bar_metrics diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index bc70652dd49f8..b617935417bdc 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -123,9 +123,6 @@ def training_epoch_end_return(self, result): else: # only saw 4 batches assert isinstance(result, TrainResult) - assert len(result.minimize) == 4 - assert self.count_num_graphs(result) == 0 - assert result.minimize.mean() == 171 result.log_acc2 = result.log_acc2.mean() result.log_and_pbar_acc1 = result.log_and_pbar_acc1.mean() From 9df0e16e19392e6805c69ac51291e20022543ce5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 10:53:08 -0400 Subject: [PATCH 017/168] added train step structured result --- pytorch_lightning/trainer/training_loop.py | 41 ++++++++++++++++------ 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index f1128062ff153..9fff1618243e4 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -529,11 +529,20 @@ def run_on_epoch_end_hook(self, model): def run_training_epoch_end(self, epoch_output): model = self.get_model() + is_result_obj = 
isinstance(epoch_output[0], Result) + + epoch_log_metrics = {} + epoch_callback_metrics = {} + epoch_progress_bar_metrics = {} + + # -------------------------- + # EPOCH END STEP IF DEFINED + # -------------------------- if self.is_overridden('training_epoch_end', model=model): self.global_step += 1 # remove the protected keys so the user doesn't have to deal with them - if isinstance(epoch_output[0], Result): + if is_result_obj: epoch_output = epoch_output[0].__class__.gather(epoch_output) minimize = epoch_output.minimize early_stop_on = epoch_output.early_stop_on @@ -542,8 +551,10 @@ def run_training_epoch_end(self, epoch_output): del epoch_output['early_stop_on'] del epoch_output['checkpoint_on'] + # run training_epoch_end epoch_output = model.training_epoch_end(epoch_output) + # with a result we put back the main metrics and compute means if isinstance(epoch_output, Result): epoch_output.minimize = minimize.mean() epoch_output.early_stop_on = early_stop_on.mean() @@ -559,15 +570,25 @@ def run_training_epoch_end(self, epoch_output): epoch_log_metrics = _processed_outputs[2] epoch_callback_metrics = _processed_outputs[3] - # TODO: do all of this for the user when no training_epoch end is defined and they used a result - # add the metrics to the loggers - self.log_metrics(epoch_log_metrics, {}) - - # add metrics to callbacks - self.callback_metrics.update(epoch_callback_metrics) - - # add metrics to progress_bar - self.add_progress_bar_metrics(epoch_progress_bar_metrics) + # -------------------------- + # Structured Result (auto epoch end) + # -------------------------- + elif is_result_obj: + # TODO: reduce outputs for user + pass + + # -------------------------- + # track results + # -------------------------- + # TODO: do all of this for the user when no training_epoch end is defined and they used a result + # add the metrics to the loggers + self.log_metrics(epoch_log_metrics, {}) + + # add metrics to callbacks + self.callback_metrics.update(epoch_callback_metrics) + + # add metrics to progress_bar + self.add_progress_bar_metrics(epoch_progress_bar_metrics) def sync_horovod(self): if self.use_horovod: From 331fe558338f62f88abf8d4c061b8de8218d7f82 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 11:01:52 -0400 Subject: [PATCH 018/168] added train step structured result --- pytorch_lightning/core/step_result.py | 14 ++++++++++++++ pytorch_lightning/trainer/training_loop.py | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 58cb7d75da61a..73560dcc74da9 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -42,6 +42,8 @@ def __getattr__(self, key): return self.get_batch_pbar_metrics() elif key == 'epoch_log_metrics': return self.get_epoch_log_metrics() + elif key == 'epoch_pbar_metrics': + return self.get_epoch_pbar_metrics() else: return self[key] except KeyError: @@ -137,6 +139,18 @@ def get_epoch_log_metrics(self): result[k] = options['value'] return result + def get_epoch_pbar_metrics(self): + """ + Gets the metrics to log at the end of the batch step + """ + result = {} + + meta = self['meta'] + for k, options in meta.items(): + if options['prog_bar']: + result[k] = options['value'] + return result + def get_batch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 
9fff1618243e4..4a0c34366bf2d 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -562,8 +562,8 @@ def run_training_epoch_end(self, epoch_output): if isinstance(epoch_output, Result): epoch_log_metrics = epoch_output.epoch_log_metrics - epoch_progress_bar_metrics = epoch_output.epoch_progress_bar_metrics - epoch_callback_metrics = epoch_output.epoch_callback_metrics + epoch_progress_bar_metrics = epoch_output.epoch_pbar_metrics + epoch_callback_metrics = epoch_output.callback_metrics else: _processed_outputs = self.process_output(epoch_output) epoch_progress_bar_metrics = _processed_outputs[1] From 0c8afc08966233ba9a290670aab8ea4ff2524427 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 11:15:44 -0400 Subject: [PATCH 019/168] added train step structured result --- pytorch_lightning/core/step_result.py | 8 ++++---- tests/base/deterministic_model.py | 9 ++++++--- tests/trainer/test_trainer_steps_result_return.py | 12 ++++++++++++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 73560dcc74da9..6ca4944cc6e57 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -124,7 +124,7 @@ def get_batch_log_metrics(self): meta = self['meta'] for k, options in meta.items(): if options['logger']: - result[k] = options['value'] + result[k] = self[k] return result def get_epoch_log_metrics(self): @@ -136,7 +136,7 @@ def get_epoch_log_metrics(self): meta = self['meta'] for k, options in meta.items(): if options['logger']: - result[k] = options['value'] + result[k] = self[k] return result def get_epoch_pbar_metrics(self): @@ -148,7 +148,7 @@ def get_epoch_pbar_metrics(self): meta = self['meta'] for k, options in meta.items(): if options['prog_bar']: - result[k] = options['value'] + result[k] = self[k] return result def get_batch_pbar_metrics(self): @@ -160,7 +160,7 @@ def get_batch_pbar_metrics(self): meta = self['meta'] for k, options in meta.items(): if options['prog_bar']: - result[k] = options['value'] + result[k] = self[k] return result def detach(self): diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index b617935417bdc..cc5c366c7c8ec 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -124,9 +124,12 @@ def training_epoch_end_return(self, result): # only saw 4 batches assert isinstance(result, TrainResult) - result.log_acc2 = result.log_acc2.mean() - result.log_and_pbar_acc1 = result.log_and_pbar_acc1.mean() - result.pbar_acc3 = result.pbar_acc3.mean() + result.log_acc2 = result.log_acc2.mean() + 11 + result.log_and_pbar_acc1 = result.log_and_pbar_acc1.mean() + 11 + result.pbar_acc3 = result.pbar_acc3.mean() + 11 + result.log('epoch_end_log_acc', torch.tensor(1212).type_as(result.pbar_acc3), logger=True) + result.log('epoch_end_pbar_acc', torch.tensor(1213).type_as(result.pbar_acc3), logger=False, prog_bar=True) + result.log('epoch_end_log_pbar_acc', torch.tensor(1214).type_as(result.pbar_acc3), logger=True, prog_bar=True) return result # -------------------------- diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index b532ed7729e7c..8c83fa83276ba 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -101,4 +101,16 @@ def test_training_step_epoch_end_result(tmpdir): assert not 
model.training_step_end_called assert model.training_epoch_end_called + # make sure correct metrics were logged + logged_metrics = trainer.debug_logged_metrics[-1] + assert logged_metrics['log_and_pbar_acc1'] == 23.0 + assert logged_metrics['log_acc2'] == 18.0 + assert logged_metrics['epoch_end_log_acc'] == 1212.0 + assert logged_metrics['epoch_end_log_pbar_acc'] == 1214.0 + assert 'epoch_end_pbar_acc' not in logged_metrics + + assert trainer.callback_metrics['early_stop_on'] == 171 + assert trainer.callback_metrics['checkpoint_on'] == 171 + + test_training_step_epoch_end_result('') \ No newline at end of file From 7f8d72d9fa145a9e82b5fd3a86f5bfdb9fe4612f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 11:17:58 -0400 Subject: [PATCH 020/168] added train step structured result --- tests/trainer/test_trainer_steps_result_return.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 8c83fa83276ba..69003a40244f3 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -109,6 +109,15 @@ def test_training_step_epoch_end_result(tmpdir): assert logged_metrics['epoch_end_log_pbar_acc'] == 1214.0 assert 'epoch_end_pbar_acc' not in logged_metrics + # make sure pbar metrics are correct + assert trainer.progress_bar_metrics['log_and_pbar_acc1'] == 23.0 + assert trainer.progress_bar_metrics['pbar_acc3'] == 28.0 + assert trainer.progress_bar_metrics['epoch_end_pbar_acc'] == 1213.0 + assert trainer.progress_bar_metrics['epoch_end_log_pbar_acc'] == 1214.0 + assert 'epoch_end_log_acc' not in trainer.progress_bar_metrics + assert 'log_acc2' not in trainer.progress_bar_metrics + + # make sure callback metrics didn't change assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 From 8254f8e382e0c7826c9323539ac2389c02b61d7a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 11:18:20 -0400 Subject: [PATCH 021/168] added train step structured result --- tests/trainer/test_trainer_steps_result_return.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 69003a40244f3..335981314c13f 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -120,6 +120,3 @@ def test_training_step_epoch_end_result(tmpdir): # make sure callback metrics didn't change assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 - - -test_training_step_epoch_end_result('') \ No newline at end of file From 7c8a32e72d1be64047d1e6b62b12602de4794cba Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 12:38:04 -0400 Subject: [PATCH 022/168] added train step structured result --- pytorch_lightning/core/step_result.py | 64 ++++++++++++++---- pytorch_lightning/trainer/training_loop.py | 16 +++-- tests/base/deterministic_model.py | 31 +++++++-- .../test_trainer_steps_result_return.py | 65 ++++++++++++------- 4 files changed, 132 insertions(+), 44 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 6ca4944cc6e57..b13bffa29fe25 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -32,6 +32,12 @@ def __init__( if minimize is not None and 
checkpoint_on is None: self.checkpoint_on = minimize.detach() + self['meta'] = { + '_internal': { + '_reduce_on_epoch': False + } + } + def __getattr__(self, key): try: if key == 'callback_metrics': @@ -81,18 +87,18 @@ def log( value, prog_bar=False, logger=True, - reduce_on_batch_end=False, - reduce_on_epoch_end=True, + on_step=False, + on_epoch=True, reduce_fx=torch.mean ): if 'meta' not in self: self.__setitem__('meta', {}) - self.__set_meta(name, value, prog_bar, logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx) + self.__set_meta(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx) # set the value self.__setitem__(name, value) - def __set_meta(self, name, value, prog_bar, logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx): + def __set_meta(self, name, value, prog_bar, logger, on_step, on_epoch, reduce_fx): # set the meta for the item meta_value = value if isinstance(meta_value, torch.Tensor): @@ -100,13 +106,17 @@ def __set_meta(self, name, value, prog_bar, logger, reduce_on_batch_end, reduce_ meta = dict( prog_bar=prog_bar, logger=logger, - reduce_on_batch_end=reduce_on_batch_end, - reduce_on_epoch_end=reduce_on_epoch_end, + on_step=on_step, + on_epoch=on_epoch, reduce_fx=reduce_fx, value=meta_value ) self['meta'][name] = meta + # track whether any input requires reduction on epoch end + internal = self['meta']['_internal'] + internal['_reduce_on_epoch'] = max(internal['_reduce_on_epoch'], on_epoch) + def get_callback_metrics(self): result = { 'early_stop_on': self.early_stop_on, @@ -123,6 +133,8 @@ def get_batch_log_metrics(self): meta = self['meta'] for k, options in meta.items(): + if k == '_internal': + continue if options['logger']: result[k] = self[k] return result @@ -135,6 +147,8 @@ def get_epoch_log_metrics(self): meta = self['meta'] for k, options in meta.items(): + if k == '_internal': + continue if options['logger']: result[k] = self[k] return result @@ -147,6 +161,8 @@ def get_epoch_pbar_metrics(self): meta = self['meta'] for k, options in meta.items(): + if k == '_internal': + continue if options['prog_bar']: result[k] = self[k] return result @@ -159,6 +175,8 @@ def get_batch_pbar_metrics(self): meta = self['meta'] for k, options in meta.items(): + if k == '_internal': + continue if options['prog_bar']: result[k] = self[k] return result @@ -197,6 +215,28 @@ def gather(cls, outputs): result['meta'] = meta return result + @classmethod + def reduce_on_epoch_end(cls, outputs): + meta = outputs[0]['meta'] + result = cls() + result = recursive_gather(outputs, result) + recursive_stack(result) + + for k, option in meta.items(): + if k == '_internal': + continue + + if option['on_epoch']: + fx = option['reduce_fx'] + result[k] = fx(result[k]) + + result['meta'] = meta + return result + + @property + def should_reduce_on_epoch_end(self): + return self['meta']['_internal']['_reduce_on_epoch'] + def recursive_gather(outputs, result=None): for out in outputs: @@ -243,11 +283,11 @@ def log( value, prog_bar=False, logger=True, - reduce_on_batch_end=True, - reduce_on_epoch_end=False, + on_step=True, + on_epoch=False, reduce_fx=torch.mean ): - super().log(name, value, prog_bar, logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx) class EvalResult(Result): @@ -267,11 +307,11 @@ def log( value, prog_bar=False, logger=True, - reduce_on_batch_end=False, - reduce_on_epoch_end=True, + on_step=False, + on_epoch=True, reduce_fx=torch.mean ): - super().log(name, value, prog_bar, 
logger, reduce_on_batch_end, reduce_on_epoch_end, reduce_fx) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx) if __name__ == '__main__': diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 4a0c34366bf2d..b2831859dfd1b 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -455,7 +455,9 @@ def run_training_epoch(self): # only track outputs when user implements training_epoch_end # otherwise we will build up unnecessary memory - if self.is_overridden('training_epoch_end', model=self.get_model()): + step_out = batch_output.training_step_output_for_epoch_end + should_auto_reduce_train_result = isinstance(step_out, Result) and step_out.should_reduce_on_epoch_end + if self.is_overridden('training_epoch_end', model=self.get_model()) or should_auto_reduce_train_result: epoch_output.append(batch_output.training_step_output_for_epoch_end) # update LR schedulers @@ -529,7 +531,7 @@ def run_on_epoch_end_hook(self, model): def run_training_epoch_end(self, epoch_output): model = self.get_model() - is_result_obj = isinstance(epoch_output[0], Result) + is_result_obj = len(epoch_output) > 0 and isinstance(epoch_output[0], Result) epoch_log_metrics = {} epoch_callback_metrics = {} @@ -574,13 +576,17 @@ def run_training_epoch_end(self, epoch_output): # Structured Result (auto epoch end) # -------------------------- elif is_result_obj: - # TODO: reduce outputs for user - pass + epoch_output = epoch_output[0].__class__.reduce_on_epoch_end(epoch_output) + epoch_output.minimize = epoch_output.minimize.mean() + epoch_output.early_stop_on = epoch_output.early_stop_on.mean() + epoch_output.checkpoint_on = epoch_output.checkpoint_on.mean() + epoch_log_metrics = epoch_output.epoch_log_metrics + epoch_progress_bar_metrics = epoch_output.epoch_pbar_metrics + epoch_callback_metrics = epoch_output.callback_metrics # -------------------------- # track results # -------------------------- - # TODO: do all of this for the user when no training_epoch end is defined and they used a result # add the metrics to the loggers self.log_metrics(epoch_log_metrics, {}) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index cc5c366c7c8ec..61e63cb569703 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -101,13 +101,36 @@ def training_epoch_end_scalar(self, outputs): # -------------------------- # Result returns # -------------------------- - def training_step_result_return(self, batch, batch_idx): + def training_step_result_log_step_only(self, batch, batch_idx): acc = self.step(batch, batch_idx) + result = TrainResult(minimize=acc) + + # step only metrics + result.log('step_log_and_pbar_acc1', torch.tensor(11).type_as(acc), prog_bar=True) + result.log('step_log_acc2', torch.tensor(12).type_as(acc)) + result.log('step_pbar_acc3', torch.tensor(13).type_as(acc), logger=False, prog_bar=True) + + self.training_step_called = True + return result + + def training_step_result_log_epoch_only(self, batch, batch_idx): + acc = self.step(batch, batch_idx) + result = TrainResult(minimize=acc) + + result.log('epoch_log_and_pbar_acc1', torch.tensor(14).type_as(acc), on_epoch=True, prog_bar=True) + result.log('epoch_log_acc2', torch.tensor(15).type_as(acc), on_epoch=True) + result.log('epoch_pbar_acc3', torch.tensor(16).type_as(acc), on_epoch=True, logger=False, prog_bar=True) + self.training_step_called = True + return result + + def 
training_step_result_log_epoch_and_step(self, batch, batch_idx): + acc = self.step(batch, batch_idx) result = TrainResult(minimize=acc) - result.log('log_and_pbar_acc1', torch.tensor(12).type_as(acc), reduce_on_epoch_end=True, prog_bar=True) - result.log('log_acc2', torch.tensor(7).type_as(acc), reduce_on_epoch_end=True) - result.log('pbar_acc3', torch.tensor(17).type_as(acc), reduce_on_epoch_end=True, logger=False, prog_bar=True) + + result.log('step_epoch_log_and_pbar_acc1', torch.tensor(17).type_as(acc), on_epoch=True, prog_bar=True) + result.log('step_epoch_log_acc2', torch.tensor(18).type_as(acc), on_epoch=True) + result.log('step_epoch_pbar_acc3', torch.tensor(19).type_as(acc), on_epoch=True, logger=False, prog_bar=True) self.training_step_called = True return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 335981314c13f..db49147beca80 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -13,7 +13,7 @@ # same for epoch_end # make sure to auto-reduce when no epoch_end is implemented -def test_training_step_result(tmpdir): +def test_training_step_result_log_step_only(tmpdir): """ Tests that only training_step can be used with TrainResult Makes sure that things are routed to pbar, loggers and loss accordingly @@ -22,7 +22,9 @@ def test_training_step_result(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' model = DeterministicModel() - model.training_step = model.training_step_result_return + model.training_step = model.training_step_result_log_step_only + model.training_step_end = None + model.training_epoch_end = None model.val_dataloader = None trainer = Trainer( @@ -37,46 +39,63 @@ def test_training_step_result(tmpdir): assert not model.training_step_end_called assert not model.training_epoch_end_called + # make sure correct metrics are logged + assert len(trainer.debug_logged_metrics) == 2 + logged_metrics = trainer.debug_logged_metrics[0] + assert logged_metrics['step_log_and_pbar_acc1'] == 11.0 + assert logged_metrics['step_log_acc2'] == 12.0 + assert 'step_pbar_acc3' not in logged_metrics + assert len(logged_metrics) == 3 + + # make sure we are using the correct metrics for callbacks + assert trainer.callback_metrics['early_stop_on'] == 171 + assert trainer.callback_metrics['checkpoint_on'] == 171 + + # make sure pbar metrics are correct ang log metrics did not leak + assert trainer.progress_bar_metrics['step_log_and_pbar_acc1'] == 11 + assert trainer.progress_bar_metrics['step_pbar_acc3'] == 13 + assert 'step_log_acc2' not in trainer.progress_bar_metrics + # make sure training outputs what is expected for batch_idx, batch in enumerate(model.train_dataloader()): break out = trainer.run_training_batch(batch, batch_idx) assert out.signal == 0 - assert out.batch_log_metrics['log_and_pbar_acc1'] == 12.0 - assert out.batch_log_metrics['log_acc2'] == 7.0 + assert out.batch_log_metrics['step_log_and_pbar_acc1'] == 11.0 + assert out.batch_log_metrics['step_log_acc2'] == 12.0 train_step_out = out.training_step_output_for_epoch_end assert isinstance(train_step_out, TrainResult) assert 'minimize' in train_step_out - assert 'log_and_pbar_acc1' in train_step_out - assert 'log_acc2' in train_step_out - - # make sure we are using the correct metrics for callbacks - assert trainer.callback_metrics['early_stop_on'] == 171 - assert trainer.callback_metrics['checkpoint_on'] == 171 - - # make sure pbar metrics are correct - assert 
trainer.progress_bar_metrics['log_and_pbar_acc1'] == 12 - assert trainer.progress_bar_metrics['pbar_acc3'] == 17 - assert 'log_acc2' not in trainer.progress_bar_metrics - - # make sure correct metrics are logged - assert len(trainer.debug_logged_metrics) == 1 - logged_metrics = trainer.debug_logged_metrics[0] - assert logged_metrics['log_and_pbar_acc1'] == 12.0 - assert logged_metrics['log_acc2'] == 7.0 - assert 'pbar_acc3' not in logged_metrics - assert len(logged_metrics) == 3 + assert 'step_log_and_pbar_acc1' in train_step_out + assert 'step_log_acc2' in train_step_out # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) +test_training_step_result_log_step_only('') + +def test_training_step_auto_reduce(tmpdir): # TODO: test that it gets reduced on epoch end # TODO: test that on batch end gets reduced + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = model.training_step_result_return + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + weights_summary=None, + ) + trainer.fit(model) + + def test_training_step_epoch_end_result(tmpdir): """ From 692731342d69dde45d3fc8c102cf69ad6e85dcd1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 12:46:56 -0400 Subject: [PATCH 023/168] added train step structured result --- tests/base/deterministic_model.py | 6 +-- .../test_trainer_steps_result_return.py | 37 +++++++++++-------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 61e63cb569703..334c322c2d6a8 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -106,9 +106,9 @@ def training_step_result_log_step_only(self, batch, batch_idx): result = TrainResult(minimize=acc) # step only metrics - result.log('step_log_and_pbar_acc1', torch.tensor(11).type_as(acc), prog_bar=True) - result.log('step_log_acc2', torch.tensor(12).type_as(acc)) - result.log('step_pbar_acc3', torch.tensor(13).type_as(acc), logger=False, prog_bar=True) + result.log(f'step_log_and_pbar_acc1_b{batch_idx}', torch.tensor(11).type_as(acc), prog_bar=True) + result.log(f'step_log_acc2_b{batch_idx}', torch.tensor(12).type_as(acc)) + result.log(f'step_pbar_acc3_b{batch_idx}', torch.tensor(13).type_as(acc), logger=False, prog_bar=True) self.training_step_called = True return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index db49147beca80..1b7a700a75944 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -17,6 +17,8 @@ def test_training_step_result_log_step_only(tmpdir): """ Tests that only training_step can be used with TrainResult Makes sure that things are routed to pbar, loggers and loss accordingly + + Makes sure pbar and logs happen on step only when requested """ # enable internal debugging actions os.environ['PL_DEV_DEBUG'] = '1' @@ -27,9 +29,13 @@ def test_training_step_result_log_step_only(tmpdir): model.training_epoch_end = None model.val_dataloader = None + batches = 3 trainer = Trainer( default_root_dir=tmpdir, - fast_dev_run=True, + limit_train_batches=batches, + limit_val_batches=batches, + row_log_interval=1, + max_epochs=1, weights_summary=None, ) trainer.fit(model) @@ -39,22 
+45,23 @@ def test_training_step_result_log_step_only(tmpdir): assert not model.training_step_end_called assert not model.training_epoch_end_called - # make sure correct metrics are logged - assert len(trainer.debug_logged_metrics) == 2 - logged_metrics = trainer.debug_logged_metrics[0] - assert logged_metrics['step_log_and_pbar_acc1'] == 11.0 - assert logged_metrics['step_log_acc2'] == 12.0 - assert 'step_pbar_acc3' not in logged_metrics - assert len(logged_metrics) == 3 + # make sure correct metrics are logged (one per batch step as requested) + assert len(trainer.debug_logged_metrics) == batches + 1 + for batch_idx, logged_metrics in enumerate(trainer.debug_logged_metrics[:-1]): + assert logged_metrics[f'step_log_and_pbar_acc1_b{batch_idx}'] == 11.0 + assert logged_metrics[f'step_log_acc2_b{batch_idx}'] == 12.0 + assert f'step_pbar_acc3_b{batch_idx}' not in logged_metrics + assert len(logged_metrics) == 3 # make sure we are using the correct metrics for callbacks assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 # make sure pbar metrics are correct ang log metrics did not leak - assert trainer.progress_bar_metrics['step_log_and_pbar_acc1'] == 11 - assert trainer.progress_bar_metrics['step_pbar_acc3'] == 13 - assert 'step_log_acc2' not in trainer.progress_bar_metrics + for batch_idx in range(batches): + assert trainer.progress_bar_metrics[f'step_log_and_pbar_acc1_b{batch_idx}'] == 11 + assert trainer.progress_bar_metrics[f'step_pbar_acc3_b{batch_idx}'] == 13 + assert f'step_log_acc2_b{batch_idx}' not in trainer.progress_bar_metrics # make sure training outputs what is expected for batch_idx, batch in enumerate(model.train_dataloader()): @@ -62,15 +69,15 @@ def test_training_step_result_log_step_only(tmpdir): out = trainer.run_training_batch(batch, batch_idx) assert out.signal == 0 - assert out.batch_log_metrics['step_log_and_pbar_acc1'] == 11.0 - assert out.batch_log_metrics['step_log_acc2'] == 12.0 + assert out.batch_log_metrics[f'step_log_and_pbar_acc1_b{batch_idx}'] == 11.0 + assert out.batch_log_metrics[f'step_log_acc2_b{batch_idx}'] == 12.0 train_step_out = out.training_step_output_for_epoch_end assert isinstance(train_step_out, TrainResult) assert 'minimize' in train_step_out - assert 'step_log_and_pbar_acc1' in train_step_out - assert 'step_log_acc2' in train_step_out + assert f'step_log_and_pbar_acc1_b{batch_idx}' in train_step_out + assert f'step_log_acc2_b{batch_idx}' in train_step_out # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) From 1c78a5b0de21066edb25a65a3ed843cfeeda376a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 13:08:45 -0400 Subject: [PATCH 024/168] added train step structured result --- pytorch_lightning/core/step_result.py | 8 +- tests/base/deterministic_model.py | 6 +- .../test_trainer_steps_result_return.py | 75 +++++++++++++++++++ 3 files changed, 82 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index b13bffa29fe25..ec1f6a3e1014e 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -135,7 +135,7 @@ def get_batch_log_metrics(self): for k, options in meta.items(): if k == '_internal': continue - if options['logger']: + if options['logger'] and options['on_step']: result[k] = self[k] return result @@ -149,7 +149,7 @@ def 
get_epoch_log_metrics(self): for k, options in meta.items(): if k == '_internal': continue - if options['logger']: + if options['logger'] and options['on_epoch']: result[k] = self[k] return result @@ -163,7 +163,7 @@ def get_epoch_pbar_metrics(self): for k, options in meta.items(): if k == '_internal': continue - if options['prog_bar']: + if options['prog_bar'] and options['on_epoch']: result[k] = self[k] return result @@ -177,7 +177,7 @@ def get_batch_pbar_metrics(self): for k, options in meta.items(): if k == '_internal': continue - if options['prog_bar']: + if options['prog_bar'] and options['on_step']: result[k] = self[k] return result diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 334c322c2d6a8..78420a47c94db 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -117,9 +117,9 @@ def training_step_result_log_epoch_only(self, batch, batch_idx): acc = self.step(batch, batch_idx) result = TrainResult(minimize=acc) - result.log('epoch_log_and_pbar_acc1', torch.tensor(14).type_as(acc), on_epoch=True, prog_bar=True) - result.log('epoch_log_acc2', torch.tensor(15).type_as(acc), on_epoch=True) - result.log('epoch_pbar_acc3', torch.tensor(16).type_as(acc), on_epoch=True, logger=False, prog_bar=True) + result.log(f'epoch_log_and_pbar_acc1_e{self.current_epoch}', torch.tensor(14).type_as(acc), on_epoch=True, prog_bar=True, on_step=False) + result.log(f'epoch_log_acc2_e{self.current_epoch}', torch.tensor(15).type_as(acc), on_epoch=True, on_step=False) + result.log(f'epoch_pbar_acc3_e{self.current_epoch}', torch.tensor(16).type_as(acc), on_epoch=True, logger=False, prog_bar=True, on_step=False) self.training_step_called = True return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 1b7a700a75944..b317a51a453be 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -83,7 +83,82 @@ def test_training_step_result_log_step_only(tmpdir): opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + +def test_training_step_result_log_epoch_only(tmpdir): + """ + Tests that only training_step can be used with TrainResult + Makes sure that things are routed to pbar, loggers and loss accordingly + + Makes sure pbar and logs happen on epoch only when requested + """ + # enable internal debugging actions + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = model.training_step_result_log_epoch_only + model.training_step_end = None + model.training_epoch_end = None + model.val_dataloader = None + + epochs = 3 + batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=batches, + limit_val_batches=batches, + row_log_interval=1, + max_epochs=epochs, + weights_summary=None, + ) + trainer.fit(model) + + # make sure correct steps were called + assert model.training_step_called + assert not model.training_step_end_called + assert not model.training_epoch_end_called + + # make sure correct metrics are logged (one per batch step as requested) + assert len(trainer.debug_logged_metrics) == epochs * (batches + 1) + epoch_metrics = [x for x in trainer.debug_logged_metrics if len(x) > 1] + assert len(epoch_metrics) == epochs + for batch_idx, logged_metrics in enumerate(epoch_metrics): + assert 
logged_metrics[f'epoch_log_and_pbar_acc1_e{batch_idx}'] == 14.0 + assert logged_metrics[f'epoch_log_acc2_e{batch_idx}'] == 15.0 + assert f'epoch_pbar_acc3_e{batch_idx}' not in logged_metrics + assert len(logged_metrics) == 3 + + # make sure we are using the correct metrics for callbacks + assert trainer.callback_metrics['early_stop_on'] == 171 + assert trainer.callback_metrics['checkpoint_on'] == 171 + + # make sure pbar metrics are correct ang log metrics did not leak + for epoch_idx in range(epochs): + assert trainer.progress_bar_metrics[f'epoch_log_and_pbar_acc1_e{epoch_idx}'] == 14 + assert trainer.progress_bar_metrics[f'epoch_pbar_acc3_e{epoch_idx}'] == 16 + assert f'epoch_log_acc2_e{epoch_idx}' not in trainer.progress_bar_metrics + + # make sure training outputs what is expected + for batch_idx, batch in enumerate(model.train_dataloader()): + break + + out = trainer.run_training_batch(batch, batch_idx) + assert out.signal == 0 + assert len(out.batch_log_metrics) == 0 + + train_step_out = out.training_step_output_for_epoch_end + assert isinstance(train_step_out, TrainResult) + + assert 'minimize' in train_step_out + assert f'epoch_log_and_pbar_acc1_e{trainer.current_epoch}' in train_step_out + assert f'epoch_log_acc2_e{trainer.current_epoch}' in train_step_out + + # make sure the optimizer closure returns the correct things + opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) + assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + test_training_step_result_log_step_only('') +test_training_step_result_log_epoch_only('') +print('a') def test_training_step_auto_reduce(tmpdir): # TODO: test that it gets reduced on epoch end From 5c67538811a598efde23def9af60fb6269f56130 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 13:10:58 -0400 Subject: [PATCH 025/168] added train step structured result --- .../test_trainer_steps_result_return.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index b317a51a453be..c32d698fe90ad 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -156,8 +156,28 @@ def test_training_step_result_log_epoch_only(tmpdir): opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + +def test_training_step_result_log_step_and_epoch(tmpdir): + """ + Tests that only training_step can be used with TrainResult + Makes sure that things are routed to pbar, loggers and loss accordingly + + Makes sure pbar and logs happen on epoch only when requested + """ + # enable internal debugging actions + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = model.training_step_result_log_epoch_and_step + model.training_step_end = None + model.training_epoch_end = None + model.val_dataloader = None + # TODO + + test_training_step_result_log_step_only('') test_training_step_result_log_epoch_only('') +test_training_step_result_log_step_and_epoch('') print('a') def test_training_step_auto_reduce(tmpdir): From 870259cd026ebfc930685358c9160cff58dc709a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 14:21:16 -0400 Subject: [PATCH 026/168] added train step structured result --- pytorch_lightning/trainer/training_loop.py | 7 +- tests/base/deterministic_model.py 
| 9 ++- .../test_trainer_steps_result_return.py | 66 +++++++++++++++++-- 3 files changed, 70 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index b2831859dfd1b..053c3c166843b 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -588,7 +588,8 @@ def run_training_epoch_end(self, epoch_output): # track results # -------------------------- # add the metrics to the loggers - self.log_metrics(epoch_log_metrics, {}) + if epoch_log_metrics and len(epoch_log_metrics) > 0: + self.log_metrics(epoch_log_metrics, {}) # add metrics to callbacks self.callback_metrics.update(epoch_callback_metrics) @@ -611,7 +612,9 @@ def save_train_loop_metrics_to_loggers(self, batch_idx, batch_output): should_log_metrics = batch_idx % self.row_log_interval == 0 or self.should_stop if should_log_metrics or self.fast_dev_run: # logs user requested information to logger - self.log_metrics(batch_output.batch_log_metrics, batch_output.grad_norm_dic) + metrics = batch_output.batch_log_metrics + if len(metrics) > 0: + self.log_metrics(metrics, batch_output.grad_norm_dic) def save_loggers_in_training_loop(self, batch_idx): # when loggers should save to disk diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 78420a47c94db..fc1d8793ce184 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -128,9 +128,12 @@ def training_step_result_log_epoch_and_step(self, batch, batch_idx): acc = self.step(batch, batch_idx) result = TrainResult(minimize=acc) - result.log('step_epoch_log_and_pbar_acc1', torch.tensor(17).type_as(acc), on_epoch=True, prog_bar=True) - result.log('step_epoch_log_acc2', torch.tensor(18).type_as(acc), on_epoch=True) - result.log('step_epoch_pbar_acc3', torch.tensor(19).type_as(acc), on_epoch=True, logger=False, prog_bar=True) + val_1 = (5 + batch_idx) * (self.current_epoch + 1) + val_2 = (6 + batch_idx) * (self.current_epoch + 1) + val_3 = (7 + batch_idx) * (self.current_epoch + 1) + result.log(f'step_epoch_log_and_pbar_acc1', torch.tensor(val_1).type_as(acc), on_epoch=True, prog_bar=True) + result.log(f'step_epoch_log_acc2', torch.tensor(val_2).type_as(acc), on_epoch=True) + result.log(f'step_epoch_pbar_acc3', torch.tensor(val_3).type_as(acc), on_epoch=True, logger=False, prog_bar=True) self.training_step_called = True return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index c32d698fe90ad..fe7d88cde1a44 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -2,6 +2,7 @@ Tests to ensure that the training loop works with a dict """ import os +import torch from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult @@ -46,8 +47,8 @@ def test_training_step_result_log_step_only(tmpdir): assert not model.training_epoch_end_called # make sure correct metrics are logged (one per batch step as requested) - assert len(trainer.debug_logged_metrics) == batches + 1 - for batch_idx, logged_metrics in enumerate(trainer.debug_logged_metrics[:-1]): + assert len(trainer.debug_logged_metrics) == batches + for batch_idx, logged_metrics in enumerate(trainer.debug_logged_metrics): assert logged_metrics[f'step_log_and_pbar_acc1_b{batch_idx}'] == 11.0 assert 
logged_metrics[f'step_log_acc2_b{batch_idx}'] == 12.0 assert f'step_pbar_acc3_b{batch_idx}' not in logged_metrics @@ -118,8 +119,8 @@ def test_training_step_result_log_epoch_only(tmpdir): assert not model.training_epoch_end_called # make sure correct metrics are logged (one per batch step as requested) - assert len(trainer.debug_logged_metrics) == epochs * (batches + 1) - epoch_metrics = [x for x in trainer.debug_logged_metrics if len(x) > 1] + assert len(trainer.debug_logged_metrics) == epochs + epoch_metrics = trainer.debug_logged_metrics assert len(epoch_metrics) == epochs for batch_idx, logged_metrics in enumerate(epoch_metrics): assert logged_metrics[f'epoch_log_and_pbar_acc1_e{batch_idx}'] == 14.0 @@ -172,11 +173,62 @@ def test_training_step_result_log_step_and_epoch(tmpdir): model.training_step_end = None model.training_epoch_end = None model.val_dataloader = None - # TODO + epochs = 3 + batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=batches, + limit_val_batches=batches, + row_log_interval=1, + max_epochs=epochs, + weights_summary=None, + ) + trainer.fit(model) + + # make sure correct steps were called + assert model.training_step_called + assert not model.training_step_end_called + assert not model.training_epoch_end_called + + # make sure correct metrics are logged (one per batch step as requested) + assert len(trainer.debug_logged_metrics) == (epochs * batches) + epochs + epoch_metrics = trainer.debug_logged_metrics + epoch_idx = -1 + for i_start in range(0, len(epoch_metrics), batches + 1): + epoch_idx += 1 + epoch_outputs = epoch_metrics[i_start: i_start + batches + 1] + mean_vals = { + 'step_epoch_log_and_pbar_acc1': [], + 'step_epoch_log_acc2': [] + } + + # make sure each batch logged the expected value + for batch_idx in range(len(epoch_outputs) - 1): + logged_metrics = epoch_outputs[batch_idx] + + expected_val_1 = (5 + batch_idx) * (epoch_idx + 1) + expected_val_2 = (6 + batch_idx) * (epoch_idx + 1) + mean_vals['step_epoch_log_and_pbar_acc1'].append(torch.tensor(expected_val_1).float()) + mean_vals['step_epoch_log_acc2'].append(torch.tensor(expected_val_2).float()) + assert logged_metrics['step_epoch_log_and_pbar_acc1'] == expected_val_1 + assert logged_metrics['step_epoch_log_acc2'] == expected_val_2 + assert 'step_epoch_pbar_acc3' not in logged_metrics + assert len(logged_metrics) == 3 + + # make sure the metrics for the epoch end are actual means (the default reduce fx) or all the batches + epoch_end_metrics = epoch_outputs[-1] + eval_1 = torch.stack(mean_vals['step_epoch_log_and_pbar_acc1']).mean() + eval_2 = torch.stack(mean_vals['step_epoch_log_acc2']).mean() + assert epoch_end_metrics['step_epoch_log_and_pbar_acc1'] == eval_1 + assert epoch_end_metrics['step_epoch_log_acc2'] == eval_2 + assert 'step_epoch_pbar_acc3' not in epoch_end_metrics + assert len(logged_metrics) == 3 + + print('a') -test_training_step_result_log_step_only('') -test_training_step_result_log_epoch_only('') +# test_training_step_result_log_step_only('') +# test_training_step_result_log_epoch_only('') test_training_step_result_log_step_and_epoch('') print('a') From 4a36ea5dfcd10c16f3e7a7ae5d4635809d3a5b97 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 14:49:03 -0400 Subject: [PATCH 027/168] added autoreduce for train step --- pytorch_lightning/trainer/logging.py | 4 ++ pytorch_lightning/trainer/trainer.py | 1 + .../test_trainer_steps_result_return.py | 67 ++++++++++++++++++- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git 
a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index 17b48aeface66..4425da3a20d46 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -84,6 +84,10 @@ def add_progress_bar_metrics(self, metrics): self.progress_bar_metrics[k] = v + if 'PL_DEV_DEBUG' in os.environ: + metrics['debug_epoch'] = self.current_epoch + self.debug_pbar_added_metrics.append(metrics) + def metrics_to_scalars(self, metrics): new_metrics = {} for k, v in metrics.items(): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b626280f17b5e..3b9ef8572b0f0 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -618,6 +618,7 @@ def __init__( # for debugging purposes only, track the logged metrics self.debug_logged_metrics = [] + self.debug_pbar_added_metrics = [] # Callback system self.on_init_end() diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index fe7d88cde1a44..af3d875a8f575 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -225,10 +225,71 @@ def test_training_step_result_log_step_and_epoch(tmpdir): assert 'step_epoch_pbar_acc3' not in epoch_end_metrics assert len(logged_metrics) == 3 - print('a') + # make sure we are using the correct metrics for callbacks + assert trainer.callback_metrics['early_stop_on'] == 171 + assert trainer.callback_metrics['checkpoint_on'] == 171 + + # ------------------------------- + # VERIFY PBAR METRICS + # ------------------------------- + # make sure pbar metrics are correct ang log metrics did not leak + all_pbar_metrics = trainer.debug_pbar_added_metrics + assert len(all_pbar_metrics) == (epochs * batches) + epochs + + epoch_idx = -1 + for i_start in range(0, len(all_pbar_metrics), batches + 1): + epoch_idx += 1 + epoch_outputs = all_pbar_metrics[i_start: i_start + batches + 1] + mean_vals = { + 'step_epoch_log_and_pbar_acc1': [], + 'step_epoch_pbar_acc3': [] + } + + # make sure each batch logged the expected value + for batch_idx in range(len(epoch_outputs) - 1): + logged_metrics = epoch_outputs[batch_idx] + + expected_val_1 = (5 + batch_idx) * (epoch_idx + 1) + expected_val_2 = (7 + batch_idx) * (epoch_idx + 1) + mean_vals['step_epoch_log_and_pbar_acc1'].append(torch.tensor(expected_val_1).float()) + mean_vals['step_epoch_pbar_acc3'].append(torch.tensor(expected_val_2).float()) + assert logged_metrics['step_epoch_log_and_pbar_acc1'] == expected_val_1 + assert logged_metrics['step_epoch_pbar_acc3'] == expected_val_2 + assert 'step_epoch_log_acc2' not in logged_metrics + assert len(logged_metrics) == 3 + + # make sure the metrics for the epoch end are actual means (the default reduce fx) or all the batches + epoch_end_metrics = epoch_outputs[-1] + eval_1 = torch.stack(mean_vals['step_epoch_log_and_pbar_acc1']).mean() + eval_2 = torch.stack(mean_vals['step_epoch_pbar_acc3']).mean() + assert epoch_end_metrics['step_epoch_log_and_pbar_acc1'] == eval_1 + assert epoch_end_metrics['step_epoch_pbar_acc3'] == eval_2 + assert 'step_epoch_log_acc2' not in epoch_end_metrics + assert len(logged_metrics) == 3 + + # ----------------------------------------- + # make sure training outputs what is expected + # ----------------------------------------- + for batch_idx, batch in enumerate(model.train_dataloader()): + break + + out = trainer.run_training_batch(batch, batch_idx) + assert out.signal == 0 + assert 
len(out.batch_log_metrics) == 2 + + train_step_out = out.training_step_output_for_epoch_end + assert isinstance(train_step_out, TrainResult) + + assert 'minimize' in train_step_out + assert f'step_epoch_log_and_pbar_acc1' in train_step_out + assert f'step_epoch_log_acc2' in train_step_out + + # make sure the optimizer closure returns the correct things + opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) + assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) -# test_training_step_result_log_step_only('') -# test_training_step_result_log_epoch_only('') +test_training_step_result_log_step_only('') +test_training_step_result_log_epoch_only('') test_training_step_result_log_step_and_epoch('') print('a') From 4837cf4461357db6c427e36e12d842ec7aa8c8c9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 15:12:02 -0400 Subject: [PATCH 028/168] added auto reduce on train --- tests/base/deterministic_model.py | 14 ++-- .../test_trainer_steps_result_return.py | 82 +++++++++++-------- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index fc1d8793ce184..96e879f921119 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -138,7 +138,7 @@ def training_step_result_log_epoch_and_step(self, batch, batch_idx): self.training_step_called = True return result - def training_epoch_end_return(self, result): + def training_epoch_end_return_for_log_epoch_and_step(self, result): """ There should be an array of scalars without graphs that are all 171 (4 of them) """ @@ -150,12 +150,12 @@ def training_epoch_end_return(self, result): # only saw 4 batches assert isinstance(result, TrainResult) - result.log_acc2 = result.log_acc2.mean() + 11 - result.log_and_pbar_acc1 = result.log_and_pbar_acc1.mean() + 11 - result.pbar_acc3 = result.pbar_acc3.mean() + 11 - result.log('epoch_end_log_acc', torch.tensor(1212).type_as(result.pbar_acc3), logger=True) - result.log('epoch_end_pbar_acc', torch.tensor(1213).type_as(result.pbar_acc3), logger=False, prog_bar=True) - result.log('epoch_end_log_pbar_acc', torch.tensor(1214).type_as(result.pbar_acc3), logger=True, prog_bar=True) + result.step_epoch_log_and_pbar_acc1 = result.step_epoch_log_and_pbar_acc1.prod() + result.step_epoch_log_acc2 = result.step_epoch_log_acc2.prod() + result.step_epoch_pbar_acc3 = result.step_epoch_pbar_acc3.prod() + result.log('epoch_end_log_acc', torch.tensor(1212).type_as(result.step_epoch_log_acc2), logger=True, on_epoch=True) + result.log('epoch_end_pbar_acc', torch.tensor(1213).type_as(result.step_epoch_log_acc2), logger=False, prog_bar=True, on_epoch=True) + result.log('epoch_end_log_pbar_acc', torch.tensor(1214).type_as(result.step_epoch_log_acc2), logger=True, prog_bar=True, on_epoch=True) return result # -------------------------- diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index af3d875a8f575..d402e98ef8371 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -288,29 +288,6 @@ def test_training_step_result_log_step_and_epoch(tmpdir): opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) -test_training_step_result_log_step_only('') -test_training_step_result_log_epoch_only('') 
-test_training_step_result_log_step_and_epoch('') -print('a') - -def test_training_step_auto_reduce(tmpdir): - # TODO: test that it gets reduced on epoch end - # TODO: test that on batch end gets reduced - - os.environ['PL_DEV_DEBUG'] = '1' - - model = DeterministicModel() - model.training_step = model.training_step_result_return - model.val_dataloader = None - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - weights_summary=None, - ) - trainer.fit(model) - - def test_training_step_epoch_end_result(tmpdir): """ @@ -319,13 +296,17 @@ def test_training_step_epoch_end_result(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' model = DeterministicModel() - model.training_step = model.training_step_result_return - model.training_epoch_end = model.training_epoch_end_return + model.training_step = model.training_step_result_log_epoch_and_step + model.training_epoch_end = model.training_epoch_end_return_for_log_epoch_and_step model.val_dataloader = None + batches = 3 + epochs = 1 trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1, + max_epochs=epochs, + row_log_interval=1, + limit_train_batches=batches, weights_summary=None, ) trainer.fit(model) @@ -336,16 +317,22 @@ def test_training_step_epoch_end_result(tmpdir): assert model.training_epoch_end_called # make sure correct metrics were logged - logged_metrics = trainer.debug_logged_metrics[-1] - assert logged_metrics['log_and_pbar_acc1'] == 23.0 - assert logged_metrics['log_acc2'] == 18.0 - assert logged_metrics['epoch_end_log_acc'] == 1212.0 - assert logged_metrics['epoch_end_log_pbar_acc'] == 1214.0 - assert 'epoch_end_pbar_acc' not in logged_metrics + logged_metrics = trainer.debug_logged_metrics + assert len(logged_metrics) == (epochs * batches) + epochs + last_logged = logged_metrics[-1] + + assert last_logged['step_epoch_log_and_pbar_acc1'] == 210.0 + assert last_logged['step_epoch_log_acc2'] == 336.0 + assert last_logged['epoch_end_log_acc'] == 1212.0 + assert last_logged['epoch_end_log_pbar_acc'] == 1214.0 + assert 'epoch_end_pbar_acc' not in last_logged # make sure pbar metrics are correct - assert trainer.progress_bar_metrics['log_and_pbar_acc1'] == 23.0 - assert trainer.progress_bar_metrics['pbar_acc3'] == 28.0 + logged_pbar = trainer.debug_pbar_added_metrics + assert len(logged_pbar) == (epochs * batches) + epochs + + assert trainer.progress_bar_metrics['step_epoch_log_and_pbar_acc1'] == 210.0 + assert trainer.progress_bar_metrics['step_epoch_pbar_acc3'] == 504.0 assert trainer.progress_bar_metrics['epoch_end_pbar_acc'] == 1213.0 assert trainer.progress_bar_metrics['epoch_end_log_pbar_acc'] == 1214.0 assert 'epoch_end_log_acc' not in trainer.progress_bar_metrics @@ -354,3 +341,30 @@ def test_training_step_epoch_end_result(tmpdir): # make sure callback metrics didn't change assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 + + # ----------------------------------------- + # make sure training outputs what is expected + # ----------------------------------------- + for batch_idx, batch in enumerate(model.train_dataloader()): + break + + out = trainer.run_training_batch(batch, batch_idx) + assert out.signal == 0 + assert len(out.batch_log_metrics) == 2 + + train_step_out = out.training_step_output_for_epoch_end + assert isinstance(train_step_out, TrainResult) + + assert 'minimize' in train_step_out + assert f'step_epoch_log_and_pbar_acc1' in train_step_out + assert f'step_epoch_log_acc2' in train_step_out + + # make sure the optimizer closure returns the correct 
things + opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) + assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + +test_training_step_result_log_step_only('') +test_training_step_result_log_epoch_only('') +test_training_step_result_log_step_and_epoch('') +test_training_step_epoch_end_result('') +print('a') From 63789856d13ef50d4e332050bd48c91aec1f8b2b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 15:12:14 -0400 Subject: [PATCH 029/168] added auto reduce on train --- tests/trainer/test_trainer_steps_result_return.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index d402e98ef8371..c2e5a770ad69f 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -362,9 +362,3 @@ def test_training_step_epoch_end_result(tmpdir): # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) - -test_training_step_result_log_step_only('') -test_training_step_result_log_epoch_only('') -test_training_step_result_log_step_and_epoch('') -test_training_step_epoch_end_result('') -print('a') From f7f654a5e5829ede14818dff7ba4e6fbe359f52b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 15:12:40 -0400 Subject: [PATCH 030/168] added auto reduce on train --- tests/trainer/test_trainer_steps_result_return.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index c2e5a770ad69f..88f20f32a2d08 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -10,9 +10,6 @@ # TODOs: # make checkpoint and early stopping use the correct metrics -# make sure step_ends receive a plain dict -# same for epoch_end -# make sure to auto-reduce when no epoch_end is implemented def test_training_step_result_log_step_only(tmpdir): """ From 73fd54b043dae4cb0d0a4694221028fd6e59486c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 15:13:23 -0400 Subject: [PATCH 031/168] added auto reduce on train --- tests/trainer/test_trainer_steps_result_return.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 88f20f32a2d08..42dafd3864878 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -10,6 +10,7 @@ # TODOs: # make checkpoint and early stopping use the correct metrics +# test with train_step_end def test_training_step_result_log_step_only(tmpdir): """ From b3f38c2594ce2dd507b1eb98fa568acbb85f464a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 15:14:10 -0400 Subject: [PATCH 032/168] added auto reduce on train --- tests/trainer/test_trainer_steps_result_return.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 42dafd3864878..5ee2be98e3a43 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -11,6 +11,7 @@ # TODOs: # make checkpoint and early stopping 
use the correct metrics # test with train_step_end +# add logging + row interval tests def test_training_step_result_log_step_only(tmpdir): """ From 6e64ba95bc5e8c43ac30a79cf1bfd0c66fdddae2 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 15:52:49 -0400 Subject: [PATCH 033/168] added auto reduce on train --- pytorch_lightning/callbacks/early_stopping.py | 5 +++ .../callbacks/model_checkpoint.py | 5 +++ pytorch_lightning/core/step_result.py | 2 - pytorch_lightning/trainer/trainer.py | 5 ++- pytorch_lightning/trainer/training_loop.py | 7 ++- tests/base/deterministic_model.py | 19 ++++++++ .../test_trainer_steps_result_return.py | 43 +++++++++++++++++++ 7 files changed, 82 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 544854fa4e983..4248ca2cfabd9 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -142,6 +142,11 @@ def on_validation_end(self, trainer, pl_module): def _run_early_stopping_check(self, trainer, pl_module): logs = trainer.callback_metrics + + # support structured results + if 'early_stop_on' in logs and logs['early_stop_on'] is not None: + self.monitor = 'early_stop_on' + if not self._validate_condition_metric(logs): return # short circuit if metric not present diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index f70d8d8d0a5e1..d3e8544f85217 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -270,6 +270,11 @@ def on_validation_end(self, trainer, pl_module): metrics = trainer.callback_metrics epoch = trainer.current_epoch + + # support structured results + if 'checkpoint_on' in metrics and metrics['checkpoint_on'] is not None: + self.monitor = 'checkpoint_on' + if self.save_top_k == 0: # no models are saved return diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index ec1f6a3e1014e..117e1ea45a325 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -27,8 +27,6 @@ def __init__( self._assert_grad_tensor_metric('minimize', minimize, err) self.minimize = minimize - if minimize is not None and early_stop_on is None: - self.early_stop_on = minimize.detach() if minimize is not None and checkpoint_on is None: self.checkpoint_on = minimize.detach() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 3b9ef8572b0f0..80896005f33dd 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -616,9 +616,12 @@ def __init__( self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') - # for debugging purposes only, track the logged metrics + # --------------------------- + # only active when debugging PL for dev purposes and tests + # --------------------------- self.debug_logged_metrics = [] self.debug_pbar_added_metrics = [] + self.debug_saved_losses = [] # Callback system self.on_init_end() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 053c3c166843b..862a42be8d327 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -143,7 +143,7 @@ def training_step(self, batch, batch_idx): trainer = Trainer(terminate_on_nan=True) """ - +import os import subprocess from abc import ABC, abstractmethod from 
typing import Callable @@ -925,6 +925,11 @@ def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens) with self.profiler.profile('on_after_backward'): model_ref.on_after_backward() + # when in dev debugging track the losses + if 'PL_DEV_DEBUG' in os.environ: + loss_dict = {'batch_idx': batch_idx, 'epoch': self.current_epoch, 'loss': untouched_loss.detach()} + self.debug_saved_losses.append(loss_dict) + result = AttributeDict( loss=untouched_loss, training_step_output=training_step_output, diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 96e879f921119..a21af96da95f8 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -101,6 +101,25 @@ def training_epoch_end_scalar(self, outputs): # -------------------------- # Result returns # -------------------------- + def training_step_no_default_callbacks_for_train_loop(self, batch, batch_idx): + """ + Early stop and checkpoint only on these values + """ + acc = self.step(batch, batch_idx) + result = TrainResult(minimize=acc) + assert 'early_step_on' not in result + assert 'checkpoint_on' in result + return result + + def training_step_result_log_epoch_and_step_for_callbacks(self, batch, batch_idx): + """ + Early stop and checkpoint only on these values + """ + losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20, 22] + loss = losses[batch_idx] + result = TrainResult(minimize=loss, early_stop_on=loss, checkpoint_on=loss) + return result + def training_step_result_log_step_only(self, batch, batch_idx): acc = self.step(batch, batch_idx) result = TrainResult(minimize=acc) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 5ee2be98e3a43..aa9ef0765acfd 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -361,3 +361,46 @@ def test_training_step_epoch_end_result(tmpdir): # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'] == (42.0 * 3) + (15.0 * 3) + + +def test_no_auto_callbacks_with_train_loop_only(tmpdir): + """ + Make sure early stop + checkpoint work with only a train loop + """ + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = model.training_step_no_default_callbacks_for_train_loop + model.training_epoch_end = None + model.val_dataloader = None + + batches = 3 + epochs = 3 + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=epochs, + row_log_interval=1, + limit_train_batches=batches, + weights_summary=None, + ) + trainer.fit(model) + + all_losses = trainer.debug_saved_losses + assert len(all_losses) == batches * epochs + + assert trainer.checkpoint_callback.monitor == 'checkpoint_on' + assert trainer.early_stop_callback is None + + trainer = Trainer( + default_root_dir=tmpdir, + early_stop_callback=True, + max_epochs=epochs, + row_log_interval=1, + limit_train_batches=batches, + weights_summary=None, + ) + trainer.fit(model) + + assert trainer.early_stop_callback.monitor == 'val_loss' + +test_no_auto_callbacks_with_train_loop_only('') \ No newline at end of file From 1b24903312f248c8a6b13fbe50413619eb18298f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 16:08:52 -0400 Subject: [PATCH 034/168] added hooks --- pytorch_lightning/callbacks/base.py | 24 +++++++++++++++ 
pytorch_lightning/callbacks/early_stopping.py | 2 ++ pytorch_lightning/trainer/callback_hook.py | 30 +++++++++++++++++++ tests/base/deterministic_model.py | 16 ++++++---- .../test_trainer_steps_result_return.py | 26 +++++++++++++++- 5 files changed, 92 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index dac8ddc11c093..37ef84c796ec2 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -46,6 +46,30 @@ def on_sanity_check_end(self, trainer, pl_module): """Called when the validation sanity check ends.""" pass + def on_train_epoch_start(self, trainer, pl_module): + """Called when the train epoch begins.""" + pass + + def on_train_epoch_end(self, trainer, pl_module): + """Called when the train epoch ends.""" + pass + + def on_val_epoch_start(self, trainer, pl_module): + """Called when the val epoch begins.""" + pass + + def on_val_epoch_end(self, trainer, pl_module): + """Called when the val epoch ends.""" + pass + + def on_test_epoch_start(self, trainer, pl_module): + """Called when the test epoch begins.""" + pass + + def on_test_epoch_end(self, trainer, pl_module): + """Called when the test epoch ends.""" + pass + def on_epoch_start(self, trainer, pl_module): """Called when the epoch begins.""" pass diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 4248ca2cfabd9..78ca2c31ae9c1 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -140,6 +140,8 @@ def on_sanity_check_end(self, trainer, pl_module): def on_validation_end(self, trainer, pl_module): self._run_early_stopping_check(trainer, pl_module) + def on_epoch_end(self, trainer, pl_module): + def _run_early_stopping_check(self, trainer, pl_module): logs = trainer.callback_metrics diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 50ea8bb7ce3c4..6266cccc25f1e 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -51,6 +51,36 @@ def on_sanity_check_end(self): for callback in self.callbacks: callback.on_sanity_check_end(self, self.get_model()) + def on_train_epoch_start(self): + """Called when the epoch begins.""" + for callback in self.callbacks: + callback.on_train_epoch_start(self, self.get_model()) + + def on_train_epoch_end(self): + """Called when the epoch begins.""" + for callback in self.callbacks: + callback.on_train_epoch_end(self, self.get_model()) + + def on_val_epoch_start(self): + """Called when the epoch begins.""" + for callback in self.callbacks: + callback.on_val_epoch_start(self, self.get_model()) + + def on_val_epoch_end(self): + """Called when the epoch begins.""" + for callback in self.callbacks: + callback.on_val_epoch_end(self, self.get_model()) + + def on_test_epoch_start(self): + """Called when the epoch begins.""" + for callback in self.callbacks: + callback.on_test_epoch_start(self, self.get_model()) + + def on_test_epoch_end(self): + """Called when the epoch begins.""" + for callback in self.callbacks: + callback.on_test_epoch_end(self, self.get_model()) + def on_epoch_start(self): """Called when the epoch begins.""" for callback in self.callbacks: diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index a21af96da95f8..77fc30c2c4f70 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -20,6 +20,8 @@ def 
__init__(self, weights=None): self.validation_step_end_called = False self.validation_epoch_end_called = False + self.assert_backward = True + self.l1 = nn.Linear(2, 3, bias=False) if weights is None: weights = torch.tensor([ @@ -115,8 +117,11 @@ def training_step_result_log_epoch_and_step_for_callbacks(self, batch, batch_idx """ Early stop and checkpoint only on these values """ + acc = self.step(batch, batch_idx) + + self.assert_backward = False losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20, 22] - loss = losses[batch_idx] + loss = acc + losses[batch_idx] result = TrainResult(minimize=loss, early_stop_on=loss, checkpoint_on=loss) return result @@ -311,10 +316,11 @@ def configure_optimizers(self): return torch.optim.Adam(self.parameters(), lr=0) def backward(self, trainer, loss, optimizer, optimizer_idx): - if self.trainer.precision == 16: - assert loss > 171 * 1000 - else: - assert loss == 171.0 + if self.assert_backward: + if self.trainer.precision == 16: + assert loss > 171 * 1000 + else: + assert loss == 171.0 loss.backward() diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index aa9ef0765acfd..038901339d02e 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -403,4 +403,28 @@ def test_no_auto_callbacks_with_train_loop_only(tmpdir): assert trainer.early_stop_callback.monitor == 'val_loss' -test_no_auto_callbacks_with_train_loop_only('') \ No newline at end of file + +def test_use_callbacks_with_train_loop_only(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks + model.training_epoch_end = None + model.val_dataloader = None + + batches = 3 + epochs = 300 + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=epochs, + early_stop_callback=True, + row_log_interval=1, + limit_train_batches=batches, + weights_summary=None, + ) + trainer.fit(model) + + all_losses = trainer.debug_saved_losses + assert len(all_losses) == batches * epochs + +test_use_callbacks_with_train_loop_only('') \ No newline at end of file From eae4d6b5dcf43f561219d9e4028076d68a5e07f8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 18 Jul 2020 16:31:47 -0400 Subject: [PATCH 035/168] added hooks --- pytorch_lightning/callbacks/early_stopping.py | 18 +++++++--- pytorch_lightning/core/hooks.py | 36 +++++++++++++++++++ pytorch_lightning/trainer/training_loop.py | 19 ++++++++++ tests/base/deterministic_model.py | 5 +-- .../test_trainer_steps_result_return.py | 3 +- 5 files changed, 73 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 78ca2c31ae9c1..6fcbeaa1e92e4 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -140,15 +140,23 @@ def on_sanity_check_end(self, trainer, pl_module): def on_validation_end(self, trainer, pl_module): self._run_early_stopping_check(trainer, pl_module) - def on_epoch_end(self, trainer, pl_module): + def on_train_epoch_end(self, trainer, pl_module): + # early stopping can also work in the train loop when there is no val loop and when using structured results + should_check_early_stop = False + if 'early_stop_on' in trainer.callback_metrics and trainer.callback_metrics['early_stop_on'] is not None: + self.monitor = 'early_stop_on' + should_check_early_stop = True + + if 'val_early_stop_on' 
in trainer.callback_metrics and trainer.callback_metrics['val_early_stop_on'] is not None: + self.monitor = 'val_early_stop_on' + should_check_early_stop = True + + if should_check_early_stop: + self._run_early_stopping_check(trainer, pl_module) def _run_early_stopping_check(self, trainer, pl_module): logs = trainer.callback_metrics - # support structured results - if 'early_stop_on' in logs and logs['early_stop_on'] is not None: - self.monitor = 'early_stop_on' - if not self._validate_condition_metric(logs): return # short circuit if metric not present diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index d8c2181251b45..60e93aa275d93 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -115,6 +115,42 @@ def on_epoch_end(self) -> None: """ # do something when the epoch ends + def on_train_epoch_start(self) -> None: + """ + Called in the training loop at the very beginning of the epoch. + """ + # do something when the epoch starts + + def on_train_epoch_end(self) -> None: + """ + Called in the training loop at the very end of the epoch. + """ + # do something when the epoch ends + + def on_val_epoch_start(self) -> None: + """ + Called in the training loop at the very beginning of the epoch. + """ + # do something when the epoch starts + + def on_val_epoch_end(self) -> None: + """ + Called in the training loop at the very end of the epoch. + """ + # do something when the epoch ends + + def on_test_epoch_start(self) -> None: + """ + Called in the training loop at the very beginning of the epoch. + """ + # do something when the epoch starts + + def on_test_epoch_end(self) -> None: + """ + Called in the training loop at the very end of the epoch. + """ + # do something when the epoch ends + def on_pre_performance_check(self) -> None: """ Called at the very beginning of the validation loop. 
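
The module hooks above only take effect once the train loop dispatches to them (the training_loop.py changes that follow). As a rough sketch of how a user module might pick them up, the example below overrides the two train-epoch hooks to time each epoch. The class name, the timing logic, and the toy layer are assumptions made purely for illustration; they are not taken from this patch.

import time

import torch
from torch import nn
from pytorch_lightning.core.lightning import LightningModule


class EpochTimingModel(LightningModule):
    """Toy module that overrides the new train-epoch hooks to time each epoch."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(2, 2)
        self._epoch_start = None

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        # minimal loss just to make the module trainable
        loss = self(batch).sum()
        return {'loss': loss}

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)

    def on_train_epoch_start(self):
        # invoked by the train loop right after the generic on_epoch_start hook
        self._epoch_start = time.time()

    def on_train_epoch_end(self):
        # invoked by the train loop right after the generic on_epoch_end hook
        print(f'train epoch took {time.time() - self._epoch_start:.2f}s')

Note that the train loop guards the model-side call with is_function_implemented, mirroring how the existing on_epoch_start hook is dispatched in the training_loop.py diff below.
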
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 862a42be8d327..2327dd6b09234 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -253,6 +253,8 @@ class TrainerTrainLoopMixin(ABC): on_epoch_end: Callable on_validation_end: Callable on_keyboard_interrupt: Callable + on_train_epoch_start: Callable + on_train_epoch_end: Callable @abstractmethod def get_model(self) -> LightningModule: @@ -422,6 +424,15 @@ def run_on_epoch_start_hook(self, model): if self.is_function_implemented('on_epoch_start'): model.on_epoch_start() + # Epoch start events + with self.profiler.profile('on_train_epoch_start'): + # callbacks + self.on_train_epoch_start() + + # model hooks + if self.is_function_implemented('on_train_epoch_start'): + model.on_train_epoch_start() + def run_training_epoch(self): # get model @@ -529,6 +540,14 @@ def run_on_epoch_end_hook(self, model): if self.is_function_implemented('on_epoch_end'): model.on_epoch_end() + with self.profiler.profile('on_train_epoch_end'): + # callbacks + self.on_train_epoch_end() + + # model hooks + if self.is_function_implemented('on_train_epoch_end'): + model.on_train_epoch_end() + def run_training_epoch_end(self, epoch_output): model = self.get_model() is_result_obj = len(epoch_output) > 0 and isinstance(epoch_output[0], Result) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 77fc30c2c4f70..46c0c31039793 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -120,8 +120,9 @@ def training_step_result_log_epoch_and_step_for_callbacks(self, batch, batch_idx acc = self.step(batch, batch_idx) self.assert_backward = False - losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20, 22] - loss = acc + losses[batch_idx] + losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20, 22, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30] + idx = batch_idx + (self.current_epoch * 3) + loss = acc + losses[idx] result = TrainResult(minimize=loss, early_stop_on=loss, checkpoint_on=loss) return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 038901339d02e..b6fb56f5911c2 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -424,7 +424,8 @@ def test_use_callbacks_with_train_loop_only(tmpdir): ) trainer.fit(model) + # TODO: finish test to make sure early stopping happened when expected all_losses = trainer.debug_saved_losses - assert len(all_losses) == batches * epochs + assert len(all_losses) == 12 test_use_callbacks_with_train_loop_only('') \ No newline at end of file From 6102c8a3f0fef8a44cafbcbcc717457b3d3abd5d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 07:45:09 -0400 Subject: [PATCH 036/168] added hooks --- pytorch_lightning/callbacks/early_stopping.py | 14 +++++ pytorch_lightning/trainer/logging.py | 7 +-- pytorch_lightning/trainer/supporters.py | 14 +++++ pytorch_lightning/trainer/trainer.py | 8 +-- pytorch_lightning/trainer/training_loop.py | 52 +++++++++++-------- .../test_trainer_steps_result_return.py | 20 +++---- 6 files changed, 72 insertions(+), 43 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 6fcbeaa1e92e4..41a6865e61788 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -7,6 +7,7 @@ """ from copy 
import deepcopy +import os import numpy as np import torch import torch.distributed as dist @@ -161,6 +162,19 @@ def _run_early_stopping_check(self, trainer, pl_module): return # short circuit if metric not present current = logs.get(self.monitor) + + # track values for dev debugging + if 'PL_DEV_DEBUG' in os.environ: + debug_dict = { + 'epoch': trainer.current_epoch, + 'global_step': trainer.global_step, + 'rank': trainer.global_rank, + 'current': current, + 'best': self.best_score, + 'patience': self.wait_count + } + trainer.debug_early_stopping_values.append(debug_dict) + if not isinstance(current, torch.Tensor): current = torch.tensor(current, device=pl_module.device) diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index 4425da3a20d46..e7c9d7636d9ae 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -74,8 +74,7 @@ def log_metrics(self, metrics, grad_norm_dic, step=None): self.logger.agg_and_log_metrics(scalar_metrics, step=step) self.logger.save() - if 'PL_DEV_DEBUG' in os.environ: - self.debug_logged_metrics.append(scalar_metrics) + self.dev_debugger.track_logged_metrics(scalar_metrics) def add_progress_bar_metrics(self, metrics): for k, v in metrics.items(): @@ -84,9 +83,7 @@ def add_progress_bar_metrics(self, metrics): self.progress_bar_metrics[k] = v - if 'PL_DEV_DEBUG' in os.environ: - metrics['debug_epoch'] = self.current_epoch - self.debug_pbar_added_metrics.append(metrics) + self.dev_debugger.track_pbar_metrics(self, metrics) def metrics_to_scalars(self, metrics): new_metrics = {} diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index fcd21becfeac0..8853d7aaa05b0 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -76,3 +76,17 @@ def _agg_memory(self, how: str): return getattr(self.memory, how)() else: return getattr(self.memory[:self.current_idx], how)() + + +class Accumulator(object): + def __init__(self): + self.num_values = 0 + self.total = 0 + + def accumulate(self, x): + with torch.no_grad(): + self.total += x + self.num_values += 1 + + def mean(self): + return self.total / self.num_values diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 80896005f33dd..4b2ccd09a6b60 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -33,6 +33,7 @@ from pytorch_lightning.trainer.lr_finder import TrainerLRFinderMixin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import rank_zero_warn, parsing, rank_zero_info, rank_zero_only +from pytorch_lightning.utilities.debugging import InternalDebugger import warnings # warnings to ignore in trainer @@ -616,12 +617,7 @@ def __init__( self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') - # --------------------------- - # only active when debugging PL for dev purposes and tests - # --------------------------- - self.debug_logged_metrics = [] - self.debug_pbar_added_metrics = [] - self.debug_saved_losses = [] + self.dev_debugger = InternalDebugger() # Callback system self.on_init_end() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 2327dd6b09234..237286d7245af 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -160,7 +160,7 @@ def training_step(self, batch, batch_idx): from 
pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.loggers import LightningLoggerBase -from pytorch_lightning.trainer.supporters import TensorRunningAccum +from pytorch_lightning.trainer.supporters import TensorRunningAccum, Accumulator from pytorch_lightning.utilities import rank_zero_warn, NATIVE_AMP_AVALAIBLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict @@ -448,6 +448,10 @@ def run_training_epoch(self): epoch_output = [] should_check_val = False + # structured result accumulators for callbacks + early_stopping_accumulator = Accumulator() + checkpoint_accumulator = Accumulator() + # run epoch for batch_idx, (batch, is_last_batch) in self.profiler.profile_iterable( enumerate(_with_is_last(train_dataloader)), "get_train_batch" @@ -468,6 +472,12 @@ def run_training_epoch(self): # otherwise we will build up unnecessary memory step_out = batch_output.training_step_output_for_epoch_end should_auto_reduce_train_result = isinstance(step_out, Result) and step_out.should_reduce_on_epoch_end + if 'early_stop_on' in step_out: + early_stopping_accumulator.accumulate(step_out['early_stop_on']) + + if 'checkpoint_on' in step_out: + checkpoint_accumulator.accumulate(step_out['checkpoint_on']) + if self.is_overridden('training_epoch_end', model=self.get_model()) or should_auto_reduce_train_result: epoch_output.append(batch_output.training_step_output_for_epoch_end) @@ -511,7 +521,7 @@ def run_training_epoch(self): self.sync_horovod() # process epoch outputs - self.run_training_epoch_end(epoch_output) + self.run_training_epoch_end(epoch_output, checkpoint_accumulator, early_stopping_accumulator) # checkpoint callback self.check_checkpoint_callback(should_check_val) @@ -548,7 +558,7 @@ def run_on_epoch_end_hook(self, model): if self.is_function_implemented('on_train_epoch_end'): model.on_train_epoch_end() - def run_training_epoch_end(self, epoch_output): + def run_training_epoch_end(self, epoch_output, checkpoint_accumulator, early_stopping_accumulator): model = self.get_model() is_result_obj = len(epoch_output) > 0 and isinstance(epoch_output[0], Result) @@ -556,6 +566,15 @@ def run_training_epoch_end(self, epoch_output): epoch_callback_metrics = {} epoch_progress_bar_metrics = {} + # ----------------------- + # Calculate epoch callback values if given + # ----------------------- + if checkpoint_accumulator.num_values > 0: + epoch_callback_metrics['checkpoint_on'] = checkpoint_accumulator.mean() + + if early_stopping_accumulator.num_values > 0: + epoch_callback_metrics['early_stop_on'] = early_stopping_accumulator.mean() + # -------------------------- # EPOCH END STEP IF DEFINED # -------------------------- @@ -565,26 +584,13 @@ def run_training_epoch_end(self, epoch_output): # remove the protected keys so the user doesn't have to deal with them if is_result_obj: epoch_output = epoch_output[0].__class__.gather(epoch_output) - minimize = epoch_output.minimize - early_stop_on = epoch_output.early_stop_on - checkpoint_on = epoch_output.checkpoint_on - del epoch_output['minimize'] - del epoch_output['early_stop_on'] - del epoch_output['checkpoint_on'] # run training_epoch_end epoch_output = model.training_epoch_end(epoch_output) - # with a result we put back the main metrics and compute means - if isinstance(epoch_output, Result): - epoch_output.minimize = minimize.mean() - epoch_output.early_stop_on = early_stop_on.mean() - 
epoch_output.checkpoint_on = checkpoint_on.mean() - if isinstance(epoch_output, Result): epoch_log_metrics = epoch_output.epoch_log_metrics epoch_progress_bar_metrics = epoch_output.epoch_pbar_metrics - epoch_callback_metrics = epoch_output.callback_metrics else: _processed_outputs = self.process_output(epoch_output) epoch_progress_bar_metrics = _processed_outputs[1] @@ -597,11 +603,8 @@ def run_training_epoch_end(self, epoch_output): elif is_result_obj: epoch_output = epoch_output[0].__class__.reduce_on_epoch_end(epoch_output) epoch_output.minimize = epoch_output.minimize.mean() - epoch_output.early_stop_on = epoch_output.early_stop_on.mean() - epoch_output.checkpoint_on = epoch_output.checkpoint_on.mean() epoch_log_metrics = epoch_output.epoch_log_metrics epoch_progress_bar_metrics = epoch_output.epoch_pbar_metrics - epoch_callback_metrics = epoch_output.callback_metrics # -------------------------- # track results @@ -663,6 +666,8 @@ def run_training_batch(self, batch, batch_idx): # track metrics to log batch_log_metrics = [] + using_results_obj = False + if batch is None: return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic) @@ -706,7 +711,7 @@ def run_training_batch(self, batch, batch_idx): optimizer, self.hiddens ) - is_result_obj = isinstance(opt_closure_result.training_step_output, Result) + using_results_obj = isinstance(opt_closure_result.training_step_output, Result) # ------------------------------ # POST forward bookkeeping @@ -714,14 +719,14 @@ def run_training_batch(self, batch, batch_idx): batch_callback_metrics.append(opt_closure_result.training_step_output.callback_metrics) # add metrics to loggers - if is_result_obj: + if using_results_obj: metrics_to_log = opt_closure_result.training_step_output.batch_log_metrics else: metrics_to_log = opt_closure_result.training_step_output.log_metrics batch_log_metrics.append(metrics_to_log) # add metrics to progress bar - if is_result_obj: + if using_results_obj: metrics_for_pbar = opt_closure_result.training_step_output.batch_pbar_metrics else: metrics_for_pbar = opt_closure_result.training_step_output.pbar_on_batch_end @@ -764,7 +769,8 @@ def run_training_batch(self, batch, batch_idx): batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()} # track all metrics for callbacks - self.callback_metrics.update({k: v for d in batch_callback_metrics for k, v in d.items()}) + if not using_results_obj: + self.callback_metrics.update({k: v for d in batch_callback_metrics for k, v in d.items()}) result = AttributeDict( signal=0, diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index b6fb56f5911c2..86a7bafc54288 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -46,8 +46,8 @@ def test_training_step_result_log_step_only(tmpdir): assert not model.training_epoch_end_called # make sure correct metrics are logged (one per batch step as requested) - assert len(trainer.debug_logged_metrics) == batches - for batch_idx, logged_metrics in enumerate(trainer.debug_logged_metrics): + assert len(trainer.dev_debugger.logged_metrics) == batches + for batch_idx, logged_metrics in enumerate(trainer.dev_debugger.logged_metrics): assert logged_metrics[f'step_log_and_pbar_acc1_b{batch_idx}'] == 11.0 assert logged_metrics[f'step_log_acc2_b{batch_idx}'] == 12.0 assert f'step_pbar_acc3_b{batch_idx}' not in logged_metrics @@ -118,8 +118,8 @@ def test_training_step_result_log_epoch_only(tmpdir): assert not 
model.training_epoch_end_called # make sure correct metrics are logged (one per batch step as requested) - assert len(trainer.debug_logged_metrics) == epochs - epoch_metrics = trainer.debug_logged_metrics + assert len(trainer.dev_debugger.logged_metrics) == epochs + epoch_metrics = trainer.dev_debugger.logged_metrics assert len(epoch_metrics) == epochs for batch_idx, logged_metrics in enumerate(epoch_metrics): assert logged_metrics[f'epoch_log_and_pbar_acc1_e{batch_idx}'] == 14.0 @@ -191,8 +191,8 @@ def test_training_step_result_log_step_and_epoch(tmpdir): assert not model.training_epoch_end_called # make sure correct metrics are logged (one per batch step as requested) - assert len(trainer.debug_logged_metrics) == (epochs * batches) + epochs - epoch_metrics = trainer.debug_logged_metrics + assert len(trainer.dev_debugger.logged_metrics) == (epochs * batches) + epochs + epoch_metrics = trainer.dev_debugger.logged_metrics epoch_idx = -1 for i_start in range(0, len(epoch_metrics), batches + 1): epoch_idx += 1 @@ -232,7 +232,7 @@ def test_training_step_result_log_step_and_epoch(tmpdir): # VERIFY PBAR METRICS # ------------------------------- # make sure pbar metrics are correct ang log metrics did not leak - all_pbar_metrics = trainer.debug_pbar_added_metrics + all_pbar_metrics = trainer.dev_debugger.pbar_added_metrics assert len(all_pbar_metrics) == (epochs * batches) + epochs epoch_idx = -1 @@ -316,7 +316,7 @@ def test_training_step_epoch_end_result(tmpdir): assert model.training_epoch_end_called # make sure correct metrics were logged - logged_metrics = trainer.debug_logged_metrics + logged_metrics = trainer.dev_debugger.logged_metrics assert len(logged_metrics) == (epochs * batches) + epochs last_logged = logged_metrics[-1] @@ -327,7 +327,7 @@ def test_training_step_epoch_end_result(tmpdir): assert 'epoch_end_pbar_acc' not in last_logged # make sure pbar metrics are correct - logged_pbar = trainer.debug_pbar_added_metrics + logged_pbar = trainer.dev_debugger.pbar_added_metrics assert len(logged_pbar) == (epochs * batches) + epochs assert trainer.progress_bar_metrics['step_epoch_log_and_pbar_acc1'] == 210.0 @@ -425,7 +425,9 @@ def test_use_callbacks_with_train_loop_only(tmpdir): trainer.fit(model) # TODO: finish test to make sure early stopping happened when expected + early_stop_vals = trainer.debug_early_stopping_values all_losses = trainer.debug_saved_losses + assert len(all_losses) == 12 test_use_callbacks_with_train_loop_only('') \ No newline at end of file From ccd08edf81ef60d03111396e8d0c1c13bb2bd4f7 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 07:55:05 -0400 Subject: [PATCH 037/168] added hooks --- pytorch_lightning/callbacks/early_stopping.py | 13 ++----------- pytorch_lightning/trainer/logging.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 3 ++- pytorch_lightning/trainer/training_loop.py | 4 +--- tests/trainer/test_trainer_steps_result_return.py | 6 +++--- 5 files changed, 10 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 41a6865e61788..abebaf416fcd9 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -163,17 +163,8 @@ def _run_early_stopping_check(self, trainer, pl_module): current = logs.get(self.monitor) - # track values for dev debugging - if 'PL_DEV_DEBUG' in os.environ: - debug_dict = { - 'epoch': trainer.current_epoch, - 'global_step': trainer.global_step, - 'rank': trainer.global_rank, - 
'current': current, - 'best': self.best_score, - 'patience': self.wait_count - } - trainer.debug_early_stopping_values.append(debug_dict) + # when in dev debugging + trainer.dev_debugger.track_early_stopping_history() if not isinstance(current, torch.Tensor): current = torch.tensor(current, device=pl_module.device) diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index e7c9d7636d9ae..96dd0d028be8d 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -74,7 +74,7 @@ def log_metrics(self, metrics, grad_norm_dic, step=None): self.logger.agg_and_log_metrics(scalar_metrics, step=step) self.logger.save() - self.dev_debugger.track_logged_metrics(scalar_metrics) + self.dev_debugger.track_logged_metrics_history(scalar_metrics) def add_progress_bar_metrics(self, metrics): for k, v in metrics.items(): @@ -83,7 +83,7 @@ def add_progress_bar_metrics(self, metrics): self.progress_bar_metrics[k] = v - self.dev_debugger.track_pbar_metrics(self, metrics) + self.dev_debugger.track_pbar_metrics_history(self, metrics) def metrics_to_scalars(self, metrics): new_metrics = {} diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4b2ccd09a6b60..76c7899508144 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -617,7 +617,8 @@ def __init__( self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') - self.dev_debugger = InternalDebugger() + # tracks internal state for debugging + self.dev_debugger = InternalDebugger(self) # Callback system self.on_init_end() diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 237286d7245af..a0444b62ff02d 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -951,9 +951,7 @@ def optimizer_closure(self, split_batch, batch_idx, opt_idx, optimizer, hiddens) model_ref.on_after_backward() # when in dev debugging track the losses - if 'PL_DEV_DEBUG' in os.environ: - loss_dict = {'batch_idx': batch_idx, 'epoch': self.current_epoch, 'loss': untouched_loss.detach()} - self.debug_saved_losses.append(loss_dict) + self.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach()) result = AttributeDict( loss=untouched_loss, diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 86a7bafc54288..3901d22f49e61 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -385,7 +385,7 @@ def test_no_auto_callbacks_with_train_loop_only(tmpdir): ) trainer.fit(model) - all_losses = trainer.debug_saved_losses + all_losses = trainer.dev_debugger.saved_losses assert len(all_losses) == batches * epochs assert trainer.checkpoint_callback.monitor == 'checkpoint_on' @@ -425,8 +425,8 @@ def test_use_callbacks_with_train_loop_only(tmpdir): trainer.fit(model) # TODO: finish test to make sure early stopping happened when expected - early_stop_vals = trainer.debug_early_stopping_values - all_losses = trainer.debug_saved_losses + early_stop_vals = trainer.dev_debugger.early_stopping_history + all_losses = trainer.dev_debugger.saved_losses assert len(all_losses) == 12 From 804e9c83cec39b931cfdd5d76a657d23a55f2836 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 08:01:41 -0400 Subject: [PATCH 038/168] added hooks --- pytorch_lightning/callbacks/early_stopping.py | 2 +- 
pytorch_lightning/trainer/logging.py | 2 +- tests/trainer/test_trainer_steps_result_return.py | 11 ++++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index abebaf416fcd9..7035159e990ff 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -164,7 +164,7 @@ def _run_early_stopping_check(self, trainer, pl_module): current = logs.get(self.monitor) # when in dev debugging - trainer.dev_debugger.track_early_stopping_history() + trainer.dev_debugger.track_early_stopping_history(current) if not isinstance(current, torch.Tensor): current = torch.tensor(current, device=pl_module.device) diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index 96dd0d028be8d..3baed4ef9d81d 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -83,7 +83,7 @@ def add_progress_bar_metrics(self, metrics): self.progress_bar_metrics[k] = v - self.dev_debugger.track_pbar_metrics_history(self, metrics) + self.dev_debugger.track_pbar_metrics_history(metrics) def metrics_to_scalars(self, metrics): new_metrics = {} diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 3901d22f49e61..2469a6f450b55 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -54,7 +54,6 @@ def test_training_step_result_log_step_only(tmpdir): assert len(logged_metrics) == 3 # make sure we are using the correct metrics for callbacks - assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 # make sure pbar metrics are correct ang log metrics did not leak @@ -128,7 +127,6 @@ def test_training_step_result_log_epoch_only(tmpdir): assert len(logged_metrics) == 3 # make sure we are using the correct metrics for callbacks - assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 # make sure pbar metrics are correct ang log metrics did not leak @@ -225,7 +223,6 @@ def test_training_step_result_log_step_and_epoch(tmpdir): assert len(logged_metrics) == 3 # make sure we are using the correct metrics for callbacks - assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 # ------------------------------- @@ -338,7 +335,6 @@ def test_training_step_epoch_end_result(tmpdir): assert 'log_acc2' not in trainer.progress_bar_metrics # make sure callback metrics didn't change - assert trainer.callback_metrics['early_stop_on'] == 171 assert trainer.callback_metrics['checkpoint_on'] == 171 # ----------------------------------------- @@ -428,6 +424,11 @@ def test_use_callbacks_with_train_loop_only(tmpdir): early_stop_vals = trainer.dev_debugger.early_stopping_history all_losses = trainer.dev_debugger.saved_losses - assert len(all_losses) == 12 + # assert len(all_losses) == 12 +test_training_step_result_log_step_only('') +test_training_step_result_log_epoch_only('') +test_training_step_result_log_step_and_epoch('') +test_training_step_epoch_end_result('') +test_no_auto_callbacks_with_train_loop_only('') test_use_callbacks_with_train_loop_only('') \ No newline at end of file From 2736c7062fc7c3192f952ff149570f8738c201a9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 08:24:16 -0400 Subject: [PATCH 039/168] 
finished tests for structured results on train epoch --- .../callbacks/model_checkpoint.py | 16 +++--- pytorch_lightning/utilities/debugging.py | 53 +++++++++++++++++++ tests/base/deterministic_model.py | 4 +- .../test_trainer_steps_result_return.py | 32 +++++++---- 4 files changed, 88 insertions(+), 17 deletions(-) create mode 100644 pytorch_lightning/utilities/debugging.py diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index d3e8544f85217..eb81e879c7d56 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -159,7 +159,11 @@ def _del_model(self, filepath): if os.path.isfile(filepath): os.remove(filepath) - def _save_model(self, filepath): + def _save_model(self, filepath, trainer, pl_module): + + # in debugging, track when we save checkpoints + trainer.dev_debugger.track_checkpointing_history(filepath) + # make paths os.makedirs(os.path.dirname(filepath), exist_ok=True) @@ -286,7 +290,7 @@ def on_validation_end(self, trainer, pl_module): if self.save_last: filepath = os.path.join(self.dirpath, self.prefix + 'last.ckpt') - self._save_model(filepath) + self._save_model(filepath, trainer, pl_module) filepath = self.format_checkpoint_name(epoch, metrics) version_cnt = 0 @@ -311,7 +315,7 @@ def on_validation_end(self, trainer, pl_module): f'Can save best model only with {self.monitor} available, skipping.', RuntimeWarning ) elif self.check_monitor_top_k(current): - self._do_check_save(filepath, current, epoch) + self._do_check_save(filepath, current, epoch, trainer, pl_module) elif self.verbose > 0: log.info(f'\nEpoch {epoch:05d}: {self.monitor} was not in top {self.save_top_k}') @@ -320,9 +324,9 @@ def on_validation_end(self, trainer, pl_module): log.info(f'\nEpoch {epoch:05d}: saving model to {filepath}') assert trainer.global_rank == 0, 'tried to make a checkpoint from non global_rank=0' - self._save_model(filepath) + self._save_model(filepath, trainer, pl_module) - def _do_check_save(self, filepath, current, epoch): + def _do_check_save(self, filepath, current, epoch, trainer, pl_module): # remove kth del_list = [] @@ -348,7 +352,7 @@ def _do_check_save(self, filepath, current, epoch): f'\nEpoch {epoch:05d}: {self.monitor} reached' f' {current:0.5f} (best {self.best_model_score:0.5f}), saving model to' f' {filepath} as top {self.save_top_k}') - self._save_model(filepath) + self._save_model(filepath, trainer, pl_module) for cur_path in del_list: if cur_path != filepath: diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py new file mode 100644 index 0000000000000..d8a7722fd8884 --- /dev/null +++ b/pytorch_lightning/utilities/debugging.py @@ -0,0 +1,53 @@ +import os + + +class InternalDebugger(object): + + def __init__(self, trainer): + + self.enabled = 'PL_DEV_DEBUG' in os.environ + self.trainer = trainer + self.logged_metrics = [] + self.pbar_added_metrics = [] + self.saved_losses = [] + self.early_stopping_history = [] + self.checkpoint_callback_history = [] + + def track_logged_metrics_history(self, scalar_metrics): + if self.enabled: + self.logged_metrics.append(scalar_metrics) + + def track_train_loss_history(self, batch_idx, loss): + if self.enabled: + loss_dict = {'batch_idx': batch_idx, 'epoch': self.trainer.current_epoch, 'loss': loss.detach()} + self.saved_losses.append(loss_dict) + + def track_pbar_metrics_history(self, metrics): + if self.enabled: + metrics['debug_epoch'] = self.trainer.current_epoch + 
self.pbar_added_metrics.append(metrics) + + def track_early_stopping_history(self, current): + if self.enabled: + es = self.trainer.early_stop_callback + debug_dict = { + 'epoch': self.trainer.current_epoch, + 'global_step': self.trainer.global_step, + 'rank': self.trainer.global_rank, + 'current': current, + 'best': es.best_score, + 'patience': es.wait_count + } + self.early_stopping_history.append(debug_dict) + + def track_checkpointing_history(self, filepath): + if self.enabled: + cb = self.trainer.checkpoint_callback + debug_dict = { + 'epoch': self.trainer.current_epoch, + 'global_step': self.trainer.global_step, + 'monitor': cb.monitor, + 'rank': self.trainer.global_rank, + 'filepath': filepath + } + self.checkpoint_callback_history.append(debug_dict) \ No newline at end of file diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 46c0c31039793..4da5da9c977f7 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -120,8 +120,8 @@ def training_step_result_log_epoch_and_step_for_callbacks(self, batch, batch_idx acc = self.step(batch, batch_idx) self.assert_backward = False - losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20, 22, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30] - idx = batch_idx + (self.current_epoch * 3) + losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20] + idx = self.current_epoch loss = acc + losses[idx] result = TrainResult(minimize=loss, early_stop_on=loss, checkpoint_on=loss) return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 2469a6f450b55..c8459f47e2c25 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -420,15 +420,29 @@ def test_use_callbacks_with_train_loop_only(tmpdir): ) trainer.fit(model) - # TODO: finish test to make sure early stopping happened when expected + num_expected_epochs = 10 + + # ---------------------------------- + # VERIFY EARLY STOPPING BEHAVIOR + # ---------------------------------- + # with train loop only it happens on every epoch early_stop_vals = trainer.dev_debugger.early_stopping_history + assert len(early_stop_vals) == num_expected_epochs + min_val = min([x['best'] for x in early_stop_vals]) + assert min_val == 171 + 9 all_losses = trainer.dev_debugger.saved_losses - # assert len(all_losses) == 12 - -test_training_step_result_log_step_only('') -test_training_step_result_log_epoch_only('') -test_training_step_result_log_step_and_epoch('') -test_training_step_epoch_end_result('') -test_no_auto_callbacks_with_train_loop_only('') -test_use_callbacks_with_train_loop_only('') \ No newline at end of file + from collections import Counter + batch_idxs = Counter([x['batch_idx'] for x in all_losses]) + for i, val in batch_idxs.items(): + assert val == num_expected_epochs + assert i in [0, 1, 2] + + # ---------------------------------- + # VERIFY CHECKPOINTING BEHAVIOR + # ---------------------------------- + ckpt_vals = trainer.dev_debugger.checkpoint_callback_history + assert len(ckpt_vals) == 5, '5 ckpts should have been saved' + for ckpt_val, expected_epoch in zip(ckpt_vals, [0, 1, 2, 3, 6]): + assert ckpt_val['epoch'] == expected_epoch + assert ckpt_val['monitor'] == 'checkpoint_on' From e09bcfc5f735a8b4e792162a52ca494119488605 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 08:40:58 -0400 Subject: [PATCH 040/168] finished tests for structured results on train epoch --- pytorch_lightning/core/step_result.py | 
12 +-- tests/base/deterministic_model.py | 10 +++ .../test_trainer_steps_result_return.py | 81 ++++++++++++++++++- 3 files changed, 95 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 117e1ea45a325..abbe5e42676a4 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict +from typing import Optional, Dict, Union from torch import Tensor import torch from copy import copy @@ -10,7 +10,7 @@ def __init__( self, minimize: Optional[Tensor] = None, early_stop_on: Tensor = None, - checkpoint_on: Tensor = None, + checkpoint_on: Union[Tensor, bool] = None, hiddens: Optional[Tensor] = None ): @@ -18,7 +18,7 @@ def __init__( if early_stop_on is not None: self.early_stop_on = early_stop_on - if checkpoint_on is not None: + if checkpoint_on is not None and checkpoint_on: self.checkpoint_on = checkpoint_on if hiddens is not None: self.hiddens = hiddens @@ -57,7 +57,7 @@ def __setattr__(self, key, val): # ensure reserve keys are tensors and detached if key in {'hiddens', 'checkpoint_on', 'early_stop_on'}: self._assert_tensor_metric(key, val) - if val is not None: + if val is not None and isinstance(val, torch.Tensor): val = val.detach() # ensure anything else that is a tensor is detached @@ -67,7 +67,7 @@ def __setattr__(self, key, val): self[key] = val def _assert_tensor_metric(self, name, x): - if x is not None: + if x is not None and not isinstance(x, bool): assert isinstance(x, Tensor), f'{name} must be a torch.Tensor' def _assert_grad_tensor_metric(self, name, x, additional_err: str = None): @@ -269,7 +269,7 @@ def __init__( self, minimize: Optional[Tensor] = None, early_stop_on: Tensor = None, - checkpoint_on: Tensor = None, + checkpoint_on: Union[Tensor, bool] = None, hiddens: Optional[Tensor] = None ): diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 4da5da9c977f7..4e46dd1dd39d5 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -113,6 +113,16 @@ def training_step_no_default_callbacks_for_train_loop(self, batch, batch_idx): assert 'checkpoint_on' in result return result + def training_step_no_callbacks_result_obj(self, batch, batch_idx): + """ + Early stop and checkpoint only on these values + """ + acc = self.step(batch, batch_idx) + result = TrainResult(minimize=acc, checkpoint_on=False) + assert 'early_step_on' not in result + assert 'checkpoint_on' not in result + return result + def training_step_result_log_epoch_and_step_for_callbacks(self, batch, batch_idx): """ Early stop and checkpoint only on these values diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index c8459f47e2c25..1b5fabcc980d3 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -8,8 +8,6 @@ from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult -# TODOs: -# make checkpoint and early stopping use the correct metrics # test with train_step_end # add logging + row interval tests @@ -400,6 +398,37 @@ def test_no_auto_callbacks_with_train_loop_only(tmpdir): assert trainer.early_stop_callback.monitor == 'val_loss' +def test_no_callbacks_with_train_loop_only(tmpdir): + """ + Make sure early stop + checkpoint work with only a train loop + """ + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = 
model.training_step_no_callbacks_result_obj + model.training_epoch_end = None + model.val_dataloader = None + + batches = 3 + epochs = 3 + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=epochs, + row_log_interval=1, + limit_train_batches=batches, + weights_summary=None, + ) + trainer.fit(model) + + all_losses = trainer.dev_debugger.saved_losses + assert len(all_losses) == batches * epochs + + assert trainer.early_stop_callback is None + + assert len(trainer.dev_debugger.checkpoint_callback_history) == 0 + assert len(trainer.dev_debugger.early_stopping_history) == 0 + + def test_use_callbacks_with_train_loop_only(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' @@ -446,3 +475,51 @@ def test_use_callbacks_with_train_loop_only(tmpdir): for ckpt_val, expected_epoch in zip(ckpt_vals, [0, 1, 2, 3, 6]): assert ckpt_val['epoch'] == expected_epoch assert ckpt_val['monitor'] == 'checkpoint_on' + + +def test_full_train_loop_with_results_obj(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + model = DeterministicModel() + model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks + model.training_epoch_end = None + model.val_dataloader = None + + batches = 3 + epochs = 300 + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=epochs, + early_stop_callback=True, + row_log_interval=1, + limit_train_batches=batches, + weights_summary=None, + ) + trainer.fit(model) + + num_expected_epochs = 10 + + # ---------------------------------- + # VERIFY EARLY STOPPING BEHAVIOR + # ---------------------------------- + # with train loop only it happens on every epoch + early_stop_vals = trainer.dev_debugger.early_stopping_history + assert len(early_stop_vals) == num_expected_epochs + min_val = min([x['best'] for x in early_stop_vals]) + assert min_val == 171 + 9 + all_losses = trainer.dev_debugger.saved_losses + + from collections import Counter + batch_idxs = Counter([x['batch_idx'] for x in all_losses]) + for i, val in batch_idxs.items(): + assert val == num_expected_epochs + assert i in [0, 1, 2] + + # ---------------------------------- + # VERIFY CHECKPOINTING BEHAVIOR + # ---------------------------------- + ckpt_vals = trainer.dev_debugger.checkpoint_callback_history + assert len(ckpt_vals) == 5, '5 ckpts should have been saved' + for ckpt_val, expected_epoch in zip(ckpt_vals, [0, 1, 2, 3, 6]): + assert ckpt_val['epoch'] == expected_epoch + assert ckpt_val['monitor'] == 'checkpoint_on' From 55eb02c28f35f19f70e7def5e948eaf27a2cea01 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 08:56:49 -0400 Subject: [PATCH 041/168] finished tests for structured results on train epoch --- pytorch_lightning/core/step_result.py | 20 +++++++++----- pytorch_lightning/trainer/training_loop.py | 1 + tests/base/deterministic_model.py | 26 +++++++++++++++++++ .../test_trainer_steps_result_return.py | 12 ++++++--- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index abbe5e42676a4..f3ca6b970cd22 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -87,10 +87,16 @@ def log( logger=True, on_step=False, on_epoch=True, - reduce_fx=torch.mean + reduce_fx=torch.mean, + enable_graph=False, ): + # no metrics should be logged with graphs + if not enable_graph and isinstance(value, torch.Tensor): + value = value.detach() + if 'meta' not in self: self.__setitem__('meta', {}) + self.__set_meta(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx) # 
set the value @@ -99,8 +105,6 @@ def log( def __set_meta(self, name, value, prog_bar, logger, on_step, on_epoch, reduce_fx): # set the meta for the item meta_value = value - if isinstance(meta_value, torch.Tensor): - meta_value = meta_value.detach() meta = dict( prog_bar=prog_bar, logger=logger, @@ -283,9 +287,10 @@ def log( logger=True, on_step=True, on_epoch=False, - reduce_fx=torch.mean + reduce_fx=torch.mean, + enable_graph=False, ): - super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) class EvalResult(Result): @@ -307,9 +312,10 @@ def log( logger=True, on_step=False, on_epoch=True, - reduce_fx=torch.mean + reduce_fx=torch.mean, + enable_graph=False, ): - super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx) + super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) if __name__ == '__main__': diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index a0444b62ff02d..bca2ec8dbb943 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -1075,6 +1075,7 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): if self.is_overridden('training_step_end'): model_ref = self.get_model() with self.profiler.profile('training_step_end'): + # TODO: modify when using result obj output = model_ref.training_step_end(output) # allow any mode to define training_end diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 4e46dd1dd39d5..f7164af3fce4e 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -103,6 +103,32 @@ def training_epoch_end_scalar(self, outputs): # -------------------------- # Result returns # -------------------------- + def training_step_full_loop_result_obj(self, batch, batch_idx): + """ + Full loop flow train step + """ + acc = self.step(batch, batch_idx) + result = TrainResult(minimize=acc) + result.log('train_step_acc1', acc + 1) + self.training_step_called = True + return result + + def training_step_end_full_loop_result_obj_dp(self, result): + """ + Full loop flow train step + """ + result.log('train_step_end_acc1', 1) + self.training_step_end_called = True + return result + + def training_epoch_end_full_loop_result_obj(self, result): + """ + Full loop flow train step + """ + result.log('train_epoch_end_acc1', 1) + self.training_epoch_end_called = True + return result + def training_step_no_default_callbacks_for_train_loop(self, batch, batch_idx): """ Early stop and checkpoint only on these values diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 1b5fabcc980d3..9921c8907d731 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -477,18 +477,20 @@ def test_use_callbacks_with_train_loop_only(tmpdir): assert ckpt_val['monitor'] == 'checkpoint_on' -def test_full_train_loop_with_results_obj(tmpdir): +def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' model = DeterministicModel() - model.training_step = model.training_step_result_log_epoch_and_step_for_callbacks - model.training_epoch_end = None + model.training_step = model.training_step_full_loop_result_obj + model.training_step_end = model.training_step_end_full_loop_result_obj_dp + model.training_epoch_end = 
model.training_epoch_end_full_loop_result_obj model.val_dataloader = None batches = 3 - epochs = 300 + epochs = 3 trainer = Trainer( default_root_dir=tmpdir, + distributed_backend='dp', max_epochs=epochs, early_stop_callback=True, row_log_interval=1, @@ -523,3 +525,5 @@ def test_full_train_loop_with_results_obj(tmpdir): for ckpt_val, expected_epoch in zip(ckpt_vals, [0, 1, 2, 3, 6]): assert ckpt_val['epoch'] == expected_epoch assert ckpt_val['monitor'] == 'checkpoint_on' + +test_full_train_loop_with_results_obj_dp('') \ No newline at end of file From b13d62b33c909a9458770a07a73a87e3b2c47b81 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 08:58:42 -0400 Subject: [PATCH 042/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 + tests/trainer/test_trainer_steps_result_return.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index f7164af3fce4e..2b9de7329dfbe 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -117,6 +117,7 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step """ + import pdb; pdb.set_trace() result.log('train_step_end_acc1', 1) self.training_step_end_called = True return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 9921c8907d731..e7859dca7f937 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -497,6 +497,8 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): limit_train_batches=batches, weights_summary=None, ) + + import pdb; pdb.set_trace() trainer.fit(model) num_expected_epochs = 10 From 70061387777b77e6041d9a20c92b2ed7390fd4cf Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:05:17 -0400 Subject: [PATCH 043/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 + tests/trainer/test_trainer_steps_result_return.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 2b9de7329dfbe..ea0bb2c1d2790 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -42,6 +42,7 @@ def step(self, batch, batch_idx): assert torch.all(test_hat[:, 0] == 15.0) assert torch.all(test_hat[:, 1] == 42.0) out = y_hat.sum() + import pdb; pdb.set_trace() assert out == (42.0 * 3) + (15.0 * 3) return out diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index e7859dca7f937..27e10f7d59210 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -491,6 +491,7 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): trainer = Trainer( default_root_dir=tmpdir, distributed_backend='dp', + gpus=2, max_epochs=epochs, early_stop_callback=True, row_log_interval=1, From 715a634a08ef3d2aeab40892bda8f3ab41265f3d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:07:15 -0400 Subject: [PATCH 044/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index ea0bb2c1d2790..76f4eeb6f7142 100644 --- 
a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -6,6 +6,23 @@ from pytorch_lightning.core.lightning import LightningModule +import sys +import pdb + + +class ForkedPdb(pdb.Pdb): + """A Pdb subclass that may be used + from a forked multiprocessing child + """ + + def interaction(self, *args, **kwargs): + _stdin = sys.stdin + try: + sys.stdin = open('/dev/stdin') + pdb.Pdb.interaction(self, *args, **kwargs) + finally: + sys.stdin = _stdin + class DeterministicModel(LightningModule): @@ -42,7 +59,7 @@ def step(self, batch, batch_idx): assert torch.all(test_hat[:, 0] == 15.0) assert torch.all(test_hat[:, 1] == 42.0) out = y_hat.sum() - import pdb; pdb.set_trace() + ForkedPdb().set_trace() assert out == (42.0 * 3) + (15.0 * 3) return out @@ -118,7 +135,7 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step """ - import pdb; pdb.set_trace() + ForkedPdb().set_trace() result.log('train_step_end_acc1', 1) self.training_step_end_called = True return result From c26a92ec2f7e87bf8374081188f436ba3d424b41 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:08:22 -0400 Subject: [PATCH 045/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 76f4eeb6f7142..36cf1aef72ef0 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -60,6 +60,7 @@ def step(self, batch, batch_idx): assert torch.all(test_hat[:, 1] == 42.0) out = y_hat.sum() ForkedPdb().set_trace() + print(out) assert out == (42.0 * 3) + (15.0 * 3) return out From e59b04cbc2f0415e0ac0c3738d1e55348bbacebc Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:10:50 -0400 Subject: [PATCH 046/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 36cf1aef72ef0..281414874e739 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -55,11 +55,11 @@ def step(self, batch, batch_idx): x = batch y_hat = self(x) + ForkedPdb().set_trace() test_hat = y_hat.cpu().detach() assert torch.all(test_hat[:, 0] == 15.0) assert torch.all(test_hat[:, 1] == 42.0) out = y_hat.sum() - ForkedPdb().set_trace() print(out) assert out == (42.0 * 3) + (15.0 * 3) From a90a719d84109e46a710c201b6a78231ba10e47c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:12:01 -0400 Subject: [PATCH 047/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 281414874e739..232d58871a55f 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -55,7 +55,6 @@ def step(self, batch, batch_idx): x = batch y_hat = self(x) - ForkedPdb().set_trace() test_hat = y_hat.cpu().detach() assert torch.all(test_hat[:, 0] == 15.0) assert torch.all(test_hat[:, 1] == 42.0) From 43b372454e2678556c4e7016837d8c1ba6ab83ec Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:13:27 -0400 Subject: [PATCH 048/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 232d58871a55f..b433750342679 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -53,6 +53,7 @@ def forward(self, x): def step(self, batch, batch_idx): x = batch + bs = x.size(0) y_hat = self(x) test_hat = y_hat.cpu().detach() @@ -60,7 +61,7 @@ def step(self, batch, batch_idx): assert torch.all(test_hat[:, 1] == 42.0) out = y_hat.sum() print(out) - assert out == (42.0 * 3) + (15.0 * 3) + assert out == (42.0 * bs) + (15.0 * bs) return out From a45b8088de1fb85cc8a6144648f9f69b52f8e7ea Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:15:13 -0400 Subject: [PATCH 049/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + tests/base/deterministic_model.py | 18 ------------------ 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index f2a23b188e068..bb147d385ce54 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -63,6 +63,7 @@ def forward(self, *inputs, **kwargs): replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) outputs = self.parallel_apply(replicas, inputs, kwargs) + import pdb; pdb.set_trace() return self.gather(outputs, self.output_device) def parallel_apply(self, replicas, inputs, kwargs): diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index b433750342679..e8caaf9c92be2 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -6,23 +6,6 @@ from pytorch_lightning.core.lightning import LightningModule -import sys -import pdb - - -class ForkedPdb(pdb.Pdb): - """A Pdb subclass that may be used - from a forked multiprocessing child - """ - - def interaction(self, *args, **kwargs): - _stdin = sys.stdin - try: - sys.stdin = open('/dev/stdin') - pdb.Pdb.interaction(self, *args, **kwargs) - finally: - sys.stdin = _stdin - class DeterministicModel(LightningModule): @@ -60,7 +43,6 @@ def step(self, batch, batch_idx): assert torch.all(test_hat[:, 0] == 15.0) assert torch.all(test_hat[:, 1] == 42.0) out = y_hat.sum() - print(out) assert out == (42.0 * bs) + (15.0 * bs) return out From c50c74e631bd61aaa1cd8812095346b6230f26d4 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:17:38 -0400 Subject: [PATCH 050/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index bb147d385ce54..d1a8a36590eac 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -63,6 +63,7 @@ def forward(self, *inputs, **kwargs): replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) outputs = self.parallel_apply(replicas, inputs, kwargs) + outputs = [{'a': x['minimize']} for x in outputs] import pdb; pdb.set_trace() return self.gather(outputs, self.output_device) From 758b5d86d76707da32d94547b6eb6f918e310c10 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:22:58 -0400 Subject: [PATCH 051/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 34 ++++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git 
a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index d1a8a36590eac..0176c67265904 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -6,6 +6,7 @@ from torch.cuda._utils import _get_device_index from torch.nn import DataParallel from torch.nn.parallel import DistributedDataParallel +from pytorch_lightning.core.step_result import Result def _find_tensors(obj): # pragma: no-cover @@ -63,9 +64,36 @@ def forward(self, *inputs, **kwargs): replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) outputs = self.parallel_apply(replicas, inputs, kwargs) - outputs = [{'a': x['minimize']} for x in outputs] - import pdb; pdb.set_trace() - return self.gather(outputs, self.output_device) + + if isinstance(outputs[0], Result): + import pdb; pdb.set_trace() + outputs = self.__gather_structured_result(outputs) + else: + outputs = self.gather(outputs, self.output_device) + return outputs + + def __gather_structured_result(self, outputs): + prototype_output = outputs[0] + original_class = prototype_output.__class__ + outputs = [dict(x) for x in outputs] + + # functions cannot be reduced... delete from each output and track so we can add back + reduce_fxs = {k: prototype_output[k] for k in prototype_output.keys() if 'reduce_fx' in k} + for i, output in enumerate(outputs): + for k in reduce_fxs.keys(): + del output[k] + + outputs = self.gather(outputs, self.output_device) + + # pass minimize to constructor for TrainResult + if 'minimize' in outputs: + result = original_class(outputs['minimize']) + else: + result = original_class() + + result.update(outputs) + result.update(reduce_fxs) + return result def parallel_apply(self, replicas, inputs, kwargs): return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) From 98123180d3b81d3f186df79b5cf5da40ae6ec042 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:24:11 -0400 Subject: [PATCH 052/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 0176c67265904..40276b4920586 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -66,7 +66,6 @@ def forward(self, *inputs, **kwargs): outputs = self.parallel_apply(replicas, inputs, kwargs) if isinstance(outputs[0], Result): - import pdb; pdb.set_trace() outputs = self.__gather_structured_result(outputs) else: outputs = self.gather(outputs, self.output_device) @@ -83,6 +82,7 @@ def __gather_structured_result(self, outputs): for k in reduce_fxs.keys(): del output[k] + import pdb; pdb.set_trace() outputs = self.gather(outputs, self.output_device) # pass minimize to constructor for TrainResult From c1af2221eea16beabb4745a59ac0548aa4ebab74 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:25:43 -0400 Subject: [PATCH 053/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 40276b4920586..a4a59b29cda16 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -77,10 +77,9 @@ def 
__gather_structured_result(self, outputs): outputs = [dict(x) for x in outputs] # functions cannot be reduced... delete from each output and track so we can add back - reduce_fxs = {k: prototype_output[k] for k in prototype_output.keys() if 'reduce_fx' in k} + meta = outputs[0].meta for i, output in enumerate(outputs): - for k in reduce_fxs.keys(): - del output[k] + del output['meta'] import pdb; pdb.set_trace() outputs = self.gather(outputs, self.output_device) From 07c4f42d4c3d9c68a9c71cf4981231696f8a45b5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:26:23 -0400 Subject: [PATCH 054/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index a4a59b29cda16..492d227dbf956 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -77,7 +77,7 @@ def __gather_structured_result(self, outputs): outputs = [dict(x) for x in outputs] # functions cannot be reduced... delete from each output and track so we can add back - meta = outputs[0].meta + meta = outputs[0]['meta'] for i, output in enumerate(outputs): del output['meta'] From 28f2c40f515fb309cbf51974bdc3b4d531f219ee Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:28:05 -0400 Subject: [PATCH 055/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 492d227dbf956..282c66fd09880 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -76,14 +76,14 @@ def __gather_structured_result(self, outputs): original_class = prototype_output.__class__ outputs = [dict(x) for x in outputs] - # functions cannot be reduced... 
delete from each output and track so we can add back + # remove all the meta info meta = outputs[0]['meta'] for i, output in enumerate(outputs): del output['meta'] - import pdb; pdb.set_trace() outputs = self.gather(outputs, self.output_device) + import pdb; pdb.set_trace() # pass minimize to constructor for TrainResult if 'minimize' in outputs: result = original_class(outputs['minimize']) @@ -91,7 +91,7 @@ def __gather_structured_result(self, outputs): result = original_class() result.update(outputs) - result.update(reduce_fxs) + result['meta'] = meta return result def parallel_apply(self, replicas, inputs, kwargs): From 15c8f55d6a32d9de91a579ac55c4ce6bceed0b3b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:29:56 -0400 Subject: [PATCH 056/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 282c66fd09880..e4e29517b41d1 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -83,7 +83,6 @@ def __gather_structured_result(self, outputs): outputs = self.gather(outputs, self.output_device) - import pdb; pdb.set_trace() # pass minimize to constructor for TrainResult if 'minimize' in outputs: result = original_class(outputs['minimize']) From f8209b22139d4588efd2773167ab58525224cbfc Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:30:44 -0400 Subject: [PATCH 057/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index e8caaf9c92be2..4070019131c08 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -3,6 +3,7 @@ from torch import nn from torch.utils.data import Dataset, DataLoader from pytorch_lightning import TrainResult +import pdb from pytorch_lightning.core.lightning import LightningModule @@ -118,7 +119,7 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step """ - ForkedPdb().set_trace() + pdb.set_trace() result.log('train_step_end_acc1', 1) self.training_step_end_called = True return result From 882437e00218abf714b8ee4df8d67a6321d3a2c7 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:32:31 -0400 Subject: [PATCH 058/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 4070019131c08..a3e4e71a9b53e 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -119,9 +119,11 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step """ - pdb.set_trace() + result.minimize = result.minimize.mean() + result.checkpoint_on = result.checkpoint_on.mean() result.log('train_step_end_acc1', 1) self.training_step_end_called = True + import pdb; pdb.set_trace() return result def training_epoch_end_full_loop_result_obj(self, result): From 874f4a27e25d5f4e35e98065ab8a5051fa624aee Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:33:25 -0400 Subject: [PATCH 059/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 + 1 
file changed, 1 insertion(+) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index a3e4e71a9b53e..1473434a58d6c 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -121,6 +121,7 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ result.minimize = result.minimize.mean() result.checkpoint_on = result.checkpoint_on.mean() + result.train_step_acc1 = result.train_step_acc1.mean() result.log('train_step_end_acc1', 1) self.training_step_end_called = True import pdb; pdb.set_trace() From 968b17e5e55bc456d85fa181cddbbcba4b98ba89 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:34:46 -0400 Subject: [PATCH 060/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 1473434a58d6c..e6e2fb46da366 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -119,12 +119,13 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step """ + self.assert_backward = False + result.minimize = result.minimize.mean() result.checkpoint_on = result.checkpoint_on.mean() result.train_step_acc1 = result.train_step_acc1.mean() result.log('train_step_end_acc1', 1) self.training_step_end_called = True - import pdb; pdb.set_trace() return result def training_epoch_end_full_loop_result_obj(self, result): From 176b884362906e23136abfa2b778e20360e9e144 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:35:11 -0400 Subject: [PATCH 061/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 27e10f7d59210..2f4e321a1bf54 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -499,8 +499,8 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): weights_summary=None, ) - import pdb; pdb.set_trace() trainer.fit(model) + import pdb; pdb.set_trace() num_expected_epochs = 10 From 7b4be6aa37f904ca62fa523559259e3eff74b938 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:36:06 -0400 Subject: [PATCH 062/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index e4e29517b41d1..3bd89a21122a4 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -81,6 +81,7 @@ def __gather_structured_result(self, outputs): for i, output in enumerate(outputs): del output['meta'] + import pdb; pdb.set_trace() outputs = self.gather(outputs, self.output_device) # pass minimize to constructor for TrainResult From 92c0323904d972d7ac0728d0500ec8188bd9845d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:37:55 -0400 Subject: [PATCH 063/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 - tests/base/deterministic_model.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py 
b/pytorch_lightning/overrides/data_parallel.py index 3bd89a21122a4..e4e29517b41d1 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -81,7 +81,6 @@ def __gather_structured_result(self, outputs): for i, output in enumerate(outputs): del output['meta'] - import pdb; pdb.set_trace() outputs = self.gather(outputs, self.output_device) # pass minimize to constructor for TrainResult diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index e6e2fb46da366..236694935567b 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -365,6 +365,8 @@ def backward(self, trainer, loss, optimizer, optimizer_idx): assert loss > 171 * 1000 else: assert loss == 171.0 + + import pdb; pdb.set_trace() loss.backward() From 8bb3b194b9cbaa4047c0902ee30d1c0684abc8c6 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:42:13 -0400 Subject: [PATCH 064/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 236694935567b..dac6316cf3ce2 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -37,6 +37,7 @@ def forward(self, x): def step(self, batch, batch_idx): x = batch + print(x.device) bs = x.size(0) y_hat = self(x) From 1c69301965b9e11c976d23c26a748b1c2e560242 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:43:00 -0400 Subject: [PATCH 065/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index dac6316cf3ce2..236694935567b 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -37,7 +37,6 @@ def forward(self, x): def step(self, batch, batch_idx): x = batch - print(x.device) bs = x.size(0) y_hat = self(x) From 4a6f193097c31cda15d96455146319430b420f44 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:43:40 -0400 Subject: [PATCH 066/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 236694935567b..3fca21d5cab16 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -122,6 +122,7 @@ def training_step_end_full_loop_result_obj_dp(self, result): self.assert_backward = False result.minimize = result.minimize.mean() + import pdb; pdb.set_trace() result.checkpoint_on = result.checkpoint_on.mean() result.train_step_acc1 = result.train_step_acc1.mean() result.log('train_step_end_acc1', 1) From 44d9a0a43e22bf5abca541a758a89c738d1d6708 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:44:34 -0400 Subject: [PATCH 067/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 3fca21d5cab16..e6b6084f1f095 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -121,8 +121,8 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ self.assert_backward = False - result.minimize = result.minimize.mean() import pdb; 
pdb.set_trace() + result.minimize = result.minimize.mean() result.checkpoint_on = result.checkpoint_on.mean() result.train_step_acc1 = result.train_step_acc1.mean() result.log('train_step_end_acc1', 1) From 26c8d3c1dcfce4f494490c76a19d817d37a9f9f5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:47:48 -0400 Subject: [PATCH 068/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index e6b6084f1f095..b9ce1d1d3f41d 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -39,6 +39,8 @@ def step(self, batch, batch_idx): x = batch bs = x.size(0) y_hat = self(x) + print(self.device) + print(self.l1.weight.device) test_hat = y_hat.cpu().detach() assert torch.all(test_hat[:, 0] == 15.0) @@ -121,7 +123,6 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ self.assert_backward = False - import pdb; pdb.set_trace() result.minimize = result.minimize.mean() result.checkpoint_on = result.checkpoint_on.mean() result.train_step_acc1 = result.train_step_acc1.mean() From f71c7971832d6278b20859a83e5a86e05a193f2f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:49:07 -0400 Subject: [PATCH 069/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index e4e29517b41d1..3ee49470f6918 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -62,6 +62,7 @@ def forward(self, *inputs, **kwargs): return self.module.validation_step(*inputs[0], **kwargs[0]) + import pdb; pdb.set_trace() replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) outputs = self.parallel_apply(replicas, inputs, kwargs) From 6bce9d012b44433cf54e2ed29ad08f8b6c67538e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:52:20 -0400 Subject: [PATCH 070/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 3ee49470f6918..432e8ef511d99 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -62,8 +62,9 @@ def forward(self, *inputs, **kwargs): return self.module.validation_step(*inputs[0], **kwargs[0]) - import pdb; pdb.set_trace() replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) + for replica in replicas: + replica.device = self.module.device outputs = self.parallel_apply(replicas, inputs, kwargs) if isinstance(outputs[0], Result): From e3226f32812a56c3821105900349ab785b6e6b59 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:53:54 -0400 Subject: [PATCH 071/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 432e8ef511d99..813ae2df98d20 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -63,8 +63,10 @@ def 
forward(self, *inputs, **kwargs): return self.module.validation_step(*inputs[0], **kwargs[0]) replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) - for replica in replicas: - replica.device = self.module.device + for replica, device_idx in zip(replicas, self.device_ids[:len(inputs)]): + replica.device = torch.device(device_idx) + # replica.to(replica.device) + outputs = self.parallel_apply(replicas, inputs, kwargs) if isinstance(outputs[0], Result): From 18630d248264bd394b5a83d37e88ba180bd1189f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:54:47 -0400 Subject: [PATCH 072/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 813ae2df98d20..d3281c1328003 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -64,8 +64,7 @@ def forward(self, *inputs, **kwargs): replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) for replica, device_idx in zip(replicas, self.device_ids[:len(inputs)]): - replica.device = torch.device(device_idx) - # replica.to(replica.device) + replica.to(torch.device(device_idx)) outputs = self.parallel_apply(replicas, inputs, kwargs) From 4a9659c17928497781577eafd7544c7a7db65ece Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:55:35 -0400 Subject: [PATCH 073/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index d3281c1328003..4aa4f50f18d5b 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -64,6 +64,7 @@ def forward(self, *inputs, **kwargs): replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) for replica, device_idx in zip(replicas, self.device_ids[:len(inputs)]): + print(device_idx) replica.to(torch.device(device_idx)) outputs = self.parallel_apply(replicas, inputs, kwargs) From 90939a8b9d86e0cacfc686330d1155bc1e1b4cee Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:57:05 -0400 Subject: [PATCH 074/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 4aa4f50f18d5b..bc267c3b545a5 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -63,9 +63,10 @@ def forward(self, *inputs, **kwargs): return self.module.validation_step(*inputs[0], **kwargs[0]) replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) - for replica, device_idx in zip(replicas, self.device_ids[:len(inputs)]): - print(device_idx) - replica.to(torch.device(device_idx)) + for replica_idx, device_idx in zip(len(replicas), self.device_ids[:len(inputs)]): + replica = replicas[replica_idx] + replica = replica.to(torch.device(device_idx)) + replicas[replica_idx] = replica outputs = self.parallel_apply(replicas, inputs, kwargs) From 33cd21b577684af85c5392940ddc6712e774bf07 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:57:47 -0400 Subject: [PATCH 
075/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index bc267c3b545a5..599a4e77ae385 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -63,7 +63,7 @@ def forward(self, *inputs, **kwargs): return self.module.validation_step(*inputs[0], **kwargs[0]) replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) - for replica_idx, device_idx in zip(len(replicas), self.device_ids[:len(inputs)]): + for replica_idx, device_idx in zip(range(len(replicas)), self.device_ids[:len(inputs)]): replica = replicas[replica_idx] replica = replica.to(torch.device(device_idx)) replicas[replica_idx] = replica From 8d32a7ae6ad341fe64c67d89163404acd2ed1b5a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:58:55 -0400 Subject: [PATCH 076/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 599a4e77ae385..6765c08c9cd9b 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -63,11 +63,6 @@ def forward(self, *inputs, **kwargs): return self.module.validation_step(*inputs[0], **kwargs[0]) replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) - for replica_idx, device_idx in zip(range(len(replicas)), self.device_ids[:len(inputs)]): - replica = replicas[replica_idx] - replica = replica.to(torch.device(device_idx)) - replicas[replica_idx] = replica - outputs = self.parallel_apply(replicas, inputs, kwargs) if isinstance(outputs[0], Result): @@ -188,6 +183,7 @@ def _worker(i, module, input, kwargs, device=None): if device is None: device = get_a_var(input).get_device() try: + print(device) with torch.cuda.device(device): # this also avoids accidental slicing of `input` if it is a Tensor if not isinstance(input, (list, tuple)): From f4a0a6f76cfcf2b37571146d48a6cf8845e1c00a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 09:59:46 -0400 Subject: [PATCH 077/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 6765c08c9cd9b..64907db71862c 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -191,6 +191,7 @@ def _worker(i, module, input, kwargs, device=None): # --------------- # CHANGE + print(module.device) if module.training: output = module.training_step(*input, **kwargs) From 78d335d47877df9276f5fd922203c41bdc787f57 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:01:00 -0400 Subject: [PATCH 078/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 34 ++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 64907db71862c..86cb4817f00af 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -183,27 
+183,27 @@ def _worker(i, module, input, kwargs, device=None): if device is None: device = get_a_var(input).get_device() try: - print(device) - with torch.cuda.device(device): - # this also avoids accidental slicing of `input` if it is a Tensor - if not isinstance(input, (list, tuple)): - input = (input,) + module.to(device) - # --------------- - # CHANGE - print(module.device) - if module.training: - output = module.training_step(*input, **kwargs) + # this also avoids accidental slicing of `input` if it is a Tensor + if not isinstance(input, (list, tuple)): + input = (input,) - elif module.testing: - output = module.test_step(*input, **kwargs) + # --------------- + # CHANGE + print(module.device) + if module.training: + output = module.training_step(*input, **kwargs) - else: - output = module.validation_step(*input, **kwargs) + elif module.testing: + output = module.test_step(*input, **kwargs) + + else: + output = module.validation_step(*input, **kwargs) - if module.use_dp or module.use_ddp2: - auto_squeeze_dim_zeros(output) - # --------------- + if module.use_dp or module.use_ddp2: + auto_squeeze_dim_zeros(output) + # --------------- with lock: results[i] = output From 0faf91280b2d5cce99de86e7b975a45bf9d5a106 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:02:20 -0400 Subject: [PATCH 079/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 86cb4817f00af..7aa5657debb8e 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -183,7 +183,9 @@ def _worker(i, module, input, kwargs, device=None): if device is None: device = get_a_var(input).get_device() try: - module.to(device) + print('old device', module.device) + module = module.to(device) + print('new device', module.device) # this also avoids accidental slicing of `input` if it is a Tensor if not isinstance(input, (list, tuple)): @@ -191,7 +193,6 @@ def _worker(i, module, input, kwargs, device=None): # --------------- # CHANGE - print(module.device) if module.training: output = module.training_step(*input, **kwargs) From e0ce316e972a8fd520df64e25035c9a4fed17440 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:05:18 -0400 Subject: [PATCH 080/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 7aa5657debb8e..21e9ee9741106 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -186,6 +186,7 @@ def _worker(i, module, input, kwargs, device=None): print('old device', module.device) module = module.to(device) print('new device', module.device) + print(input) # this also avoids accidental slicing of `input` if it is a Tensor if not isinstance(input, (list, tuple)): From 947dc70df0e82fea4138974a1dae73016a1c8ff2 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:13:11 -0400 Subject: [PATCH 081/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 35 ++++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py 
b/pytorch_lightning/overrides/data_parallel.py index 21e9ee9741106..00a572a173d17 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -183,29 +183,28 @@ def _worker(i, module, input, kwargs, device=None): if device is None: device = get_a_var(input).get_device() try: - print('old device', module.device) - module = module.to(device) - print('new device', module.device) - print(input) + with torch.cuda.device(device): + # this also avoids accidental slicing of `input` if it is a Tensor + if not isinstance(input, (list, tuple)): + input = (input,) - # this also avoids accidental slicing of `input` if it is a Tensor - if not isinstance(input, (list, tuple)): - input = (input,) + print(input, module.device) - # --------------- - # CHANGE - if module.training: - output = module.training_step(*input, **kwargs) + # --------------- + # CHANGE + print(module.device) + if module.training: + output = module.training_step(*input, **kwargs) - elif module.testing: - output = module.test_step(*input, **kwargs) + elif module.testing: + output = module.test_step(*input, **kwargs) - else: - output = module.validation_step(*input, **kwargs) + else: + output = module.validation_step(*input, **kwargs) - if module.use_dp or module.use_ddp2: - auto_squeeze_dim_zeros(output) - # --------------- + if module.use_dp or module.use_ddp2: + auto_squeeze_dim_zeros(output) + # --------------- with lock: results[i] = output From 1bd96c1989a9c114d1331288fd9e119af592d4fb Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:14:18 -0400 Subject: [PATCH 082/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 00a572a173d17..be6603edd2cde 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,7 +188,7 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) - print(input, module.device) + print(input, module.device, module.l1.device) # --------------- # CHANGE From 886a094d3c30e721bd7444a8571b5c3c62e61479 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:15:06 -0400 Subject: [PATCH 083/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index be6603edd2cde..c50a89e989936 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,7 +188,7 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) - print(input, module.device, module.l1.device) + print(input, module.device, module.l1.weight.device) # --------------- # CHANGE From e891e4bec28294ed7049c26a53993c66b610371f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:16:38 -0400 Subject: [PATCH 084/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index c50a89e989936..5634e08658226 100644 --- 
a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -187,7 +187,7 @@ def _worker(i, module, input, kwargs, device=None): # this also avoids accidental slicing of `input` if it is a Tensor if not isinstance(input, (list, tuple)): input = (input,) - + module._device = device print(input, module.device, module.l1.weight.device) # --------------- From 987073920460ee4fcb3bb436865ccc5fd3609951 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:18:38 -0400 Subject: [PATCH 085/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 - tests/base/deterministic_model.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 5634e08658226..f45daad389688 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -192,7 +192,6 @@ def _worker(i, module, input, kwargs, device=None): # --------------- # CHANGE - print(module.device) if module.training: output = module.training_step(*input, **kwargs) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index b9ce1d1d3f41d..420b327ed5fbf 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -38,7 +38,7 @@ def forward(self, x): def step(self, batch, batch_idx): x = batch bs = x.size(0) - y_hat = self(x) + y_hat = self.l1(x) print(self.device) print(self.l1.weight.device) From 50ddc5afb482e3f4bbe02011be336795a891bd52 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:19:29 -0400 Subject: [PATCH 086/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index f45daad389688..5d58a3b62d973 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,7 +188,6 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) module._device = device - print(input, module.device, module.l1.weight.device) # --------------- # CHANGE From bb9dce788b32d478c595258c2a0547ed9c31ace0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:23:45 -0400 Subject: [PATCH 087/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 420b327ed5fbf..b882379add749 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -39,8 +39,7 @@ def step(self, batch, batch_idx): x = batch bs = x.size(0) y_hat = self.l1(x) - print(self.device) - print(self.l1.weight.device) + print(x.device, self.device, self.l1.weight.device) test_hat = y_hat.cpu().detach() assert torch.all(test_hat[:, 0] == 15.0) From df2b590ba2338481f1a1b312e0a3409b846e1a38 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:35:20 -0400 Subject: [PATCH 088/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 
5d58a3b62d973..52d25a515e072 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,6 +188,7 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) module._device = device + module = module.to(device) # --------------- # CHANGE From 68ab1302c4455fa984e4342010c7c61c5224c200 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:36:14 -0400 Subject: [PATCH 089/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 52d25a515e072..5aabd64cbaf95 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -189,6 +189,7 @@ def _worker(i, module, input, kwargs, device=None): input = (input,) module._device = device module = module.to(device) + print(module.device, module._device, 'a') # --------------- # CHANGE From d846ff3fb92bec383e3efff349ff88a23076358b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:36:59 -0400 Subject: [PATCH 090/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 5aabd64cbaf95..b8b373f83ea1d 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -187,9 +187,9 @@ def _worker(i, module, input, kwargs, device=None): # this also avoids accidental slicing of `input` if it is a Tensor if not isinstance(input, (list, tuple)): input = (input,) + print(module.device, module._device, 'a') module._device = device module = module.to(device) - print(module.device, module._device, 'a') # --------------- # CHANGE From e97722a1cd14aa632d9e8e77c2ad2078da5b3467 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:38:35 -0400 Subject: [PATCH 091/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index b8b373f83ea1d..229d9133b1efd 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -187,8 +187,9 @@ def _worker(i, module, input, kwargs, device=None): # this also avoids accidental slicing of `input` if it is a Tensor if not isinstance(input, (list, tuple)): input = (input,) - print(module.device, module._device, 'a') - module._device = device + + if hasattr(module, '_device'): + module._device = device module = module.to(device) # --------------- From 36319cd17e43766ae8f85e1667e04f5d13fa186a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 10:39:08 -0400 Subject: [PATCH 092/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 229d9133b1efd..e71b487d13488 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -194,6 +194,8 @@ def _worker(i, module, 
input, kwargs, device=None): # --------------- # CHANGE + print(module._device, module.device) + import pdb; pdb.set_trace() if module.training: output = module.training_step(*input, **kwargs) From 06ecec5c94f18bf1695b5181a386dc1547856d9a Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:14:42 -0400 Subject: [PATCH 093/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 2f4e321a1bf54..4c34ed4d09437 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -6,6 +6,7 @@ from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +from tests.base import EvalModelTemplate # test with train_step_end @@ -480,7 +481,7 @@ def test_use_callbacks_with_train_loop_only(tmpdir): def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' - model = DeterministicModel() + model = EvalModelTemplate() model.training_step = model.training_step_full_loop_result_obj model.training_step_end = model.training_step_end_full_loop_result_obj_dp model.training_epoch_end = model.training_epoch_end_full_loop_result_obj From 3a6c132c6b7227ed9223a2f19fe2e0588357e5f1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:15:47 -0400 Subject: [PATCH 094/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index e71b487d13488..007643266da4b 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -195,7 +195,6 @@ def _worker(i, module, input, kwargs, device=None): # --------------- # CHANGE print(module._device, module.device) - import pdb; pdb.set_trace() if module.training: output = module.training_step(*input, **kwargs) From 98e11e323e4e8c0aa4a6e77bfe18349c6536e447 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:18:51 -0400 Subject: [PATCH 095/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 4c34ed4d09437..d1dde797e6bd2 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -482,9 +482,10 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() - model.training_step = model.training_step_full_loop_result_obj - model.training_step_end = model.training_step_end_full_loop_result_obj_dp - model.training_epoch_end = model.training_epoch_end_full_loop_result_obj + model2 = DeterministicModel() + model.training_step = model2.training_step_full_loop_result_obj + model.training_step_end = model2.training_step_end_full_loop_result_obj_dp + model.training_epoch_end = model2.training_epoch_end_full_loop_result_obj model.val_dataloader = None batches = 3 From 7782abe2960300e62701390ec3caba46f232f73d Mon Sep 17 00:00:00 2001 
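The data_parallel.py hunks above (patches 051-055 and the follow-ups through 095) converge on one idea: when the per-GPU outputs of a DP forward are Result objects, the non-tensor 'meta' entry is stripped before calling gather, the remaining tensor values are gathered onto the output device, the original Result subclass is rebuilt (passing 'minimize' to the constructor, as the in-code comment notes, for TrainResult), and the meta dict is re-attached afterwards. A minimal stand-alone sketch of that gathering idea, using plain dicts and torch.stack in place of nn.parallel.gather; the helper name and the toy data are illustrative and not part of Lightning:

    import torch

    def gather_structured(outputs):
        # one dict per GPU replica: tensor values plus a shared, non-tensor 'meta' entry
        meta = outputs[0]['meta']
        stripped = [{k: v for k, v in out.items() if k != 'meta'} for out in outputs]
        # stand-in for nn.parallel.gather: stack each tensor key across replicas
        gathered = {k: torch.stack([out[k] for out in stripped]) for k in stripped[0]}
        gathered['meta'] = meta  # re-attach the reduction metadata at the end
        return gathered

    # toy usage: two "replicas", each reporting a scalar loss
    replica_outputs = [
        {'minimize': torch.tensor(0.7), 'meta': {'minimize': {'reduce_fx': torch.mean}}},
        {'minimize': torch.tensor(0.9), 'meta': {'minimize': {'reduce_fx': torch.mean}}},
    ]
    print(gather_structured(replica_outputs)['minimize'])  # tensor([0.7000, 0.9000])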
From: William Falcon Date: Sun, 19 Jul 2020 11:20:40 -0400 Subject: [PATCH 096/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index d1dde797e6bd2..e5d992c988a20 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -482,11 +482,14 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() + model.validation_step = None + model.test_step = None model2 = DeterministicModel() model.training_step = model2.training_step_full_loop_result_obj model.training_step_end = model2.training_step_end_full_loop_result_obj_dp model.training_epoch_end = model2.training_epoch_end_full_loop_result_obj model.val_dataloader = None + model.test_dataloader = None batches = 3 epochs = 3 From 2d4eccfce97abb9a9f9ab14ef03cdf6b6118b085 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:23:07 -0400 Subject: [PATCH 097/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index b882379add749..5e013dbfb0425 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -110,9 +110,12 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): """ Full loop flow train step """ - acc = self.step(batch, batch_idx) - result = TrainResult(minimize=acc) - result.log('train_step_acc1', acc + 1) + x, y = batch + x = x.view(x.size(0), -1) + y_hat = self(x) + loss_val = self.loss(y, y_hat) + result = TrainResult(minimize=loss_val) + result.log('train_step_acc1', loss_val + 1) self.training_step_called = True return result From 3bbd01fc40dad2130765aa973577e0b111a6b999 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:26:59 -0400 Subject: [PATCH 098/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 37 ------------------- tests/base/model_template.py | 3 ++ tests/base/model_train_steps.py | 33 +++++++++++++++++ .../test_trainer_steps_result_return.py | 7 ++-- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 5e013dbfb0425..a7c5b210d55cd 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -103,43 +103,6 @@ def training_epoch_end_scalar(self, outputs): prototype_loss = outputs[0] return prototype_loss - # -------------------------- - # Result returns - # -------------------------- - def training_step_full_loop_result_obj(self, batch, batch_idx): - """ - Full loop flow train step - """ - x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x) - loss_val = self.loss(y, y_hat) - result = TrainResult(minimize=loss_val) - result.log('train_step_acc1', loss_val + 1) - self.training_step_called = True - return result - - def training_step_end_full_loop_result_obj_dp(self, result): - """ - Full loop flow train step - """ - self.assert_backward = False - - result.minimize = result.minimize.mean() - result.checkpoint_on = result.checkpoint_on.mean() - result.train_step_acc1 = result.train_step_acc1.mean() - result.log('train_step_end_acc1', 1) - 
self.training_step_end_called = True - return result - - def training_epoch_end_full_loop_result_obj(self, result): - """ - Full loop flow train step - """ - result.log('train_epoch_end_acc1', 1) - self.training_epoch_end_called = True - return result - def training_step_no_default_callbacks_for_train_loop(self, batch, batch_idx): """ Early stop and checkpoint only on these values diff --git a/tests/base/model_template.py b/tests/base/model_template.py index 48851cdb08219..a89769e6f487b 100644 --- a/tests/base/model_template.py +++ b/tests/base/model_template.py @@ -63,6 +63,9 @@ def __init__( self.hidden_dim = hidden_dim self.b1 = b1 self.b2 = b2 + self.training_step_called = False + self.training_step_end_called = False + self.training_epoch_end_called = False # if you specify an example input, the summary will show input/output for each layer # TODO: to be fixed in #1773 diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index fcd020d852126..e86b6e4f9f0af 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -1,6 +1,7 @@ import math from abc import ABC from collections import OrderedDict +from pytorch_lightning import TrainResult import torch @@ -38,3 +39,35 @@ def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None): else: output /= 0 return output + + def training_step_full_loop_result_obj(self, batch, batch_idx): + """ + Full loop flow train step + """ + x, y = batch + x = x.view(x.size(0), -1) + y_hat = self(x) + loss_val = self.loss(y, y_hat) + result = TrainResult(minimize=loss_val) + result.log('train_step_acc1', loss_val + 1) + self.training_step_called = True + return result + + def training_step_end_full_loop_result_obj_dp(self, result): + """ + Full loop flow train step + """ + result.minimize = result.minimize.mean() + result.checkpoint_on = result.checkpoint_on.mean() + result.train_step_acc1 = result.train_step_acc1.mean() + result.log('train_step_end_acc1', 1) + self.training_step_end_called = True + return result + + def training_epoch_end_full_loop_result_obj(self, result): + """ + Full loop flow train step + """ + result.log('train_epoch_end_acc1', 1) + self.training_epoch_end_called = True + return result diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index e5d992c988a20..13a14492a0a4f 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -484,10 +484,9 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): model = EvalModelTemplate() model.validation_step = None model.test_step = None - model2 = DeterministicModel() - model.training_step = model2.training_step_full_loop_result_obj - model.training_step_end = model2.training_step_end_full_loop_result_obj_dp - model.training_epoch_end = model2.training_epoch_end_full_loop_result_obj + model.training_step = model.training_step_full_loop_result_obj + model.training_step_end = model.training_step_end_full_loop_result_obj_dp + model.training_epoch_end = model.training_epoch_end_full_loop_result_obj model.val_dataloader = None model.test_dataloader = None From 77c28b080fa8c060ea38f822d3b214a4d96dd05f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:29:31 -0400 Subject: [PATCH 099/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 1 - tests/base/model_train_steps.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 007643266da4b..229d9133b1efd 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -194,7 +194,6 @@ def _worker(i, module, input, kwargs, device=None): # --------------- # CHANGE - print(module._device, module.device) if module.training: output = module.training_step(*input, **kwargs) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index e86b6e4f9f0af..62fe11f4b2750 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -46,6 +46,7 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): """ x, y = batch x = x.view(x.size(0), -1) + print(self.device, self.c_d1.weight.device, x.device) y_hat = self(x) loss_val = self.loss(y, y_hat) result = TrainResult(minimize=loss_val) From 0f180731479ef683aebaabfb51402750d893ffed Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:32:29 -0400 Subject: [PATCH 100/168] finished tests for structured results on train epoch --- pytorch_lightning/trainer/distrib_parts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 20edc0d60541a..83bb1b8875902 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -232,7 +232,7 @@ def dp_train(self, model): if self.is_function_implemented('setup', model): model.setup('fit') - model.cuda(self.root_gpu) + # model.cuda(self.root_gpu) # CHOOSE OPTIMIZER # allow for lr schedulers as well From a0dd29bb0b8b1772d1b8fdca3d17be9191453b58 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:33:07 -0400 Subject: [PATCH 101/168] finished tests for structured results on train epoch --- pytorch_lightning/trainer/distrib_parts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 83bb1b8875902..94c9f83328bfc 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -232,7 +232,7 @@ def dp_train(self, model): if self.is_function_implemented('setup', model): model.setup('fit') - # model.cuda(self.root_gpu) + model.cuda(self.root_gpu) # CHOOSE OPTIMIZER # allow for lr schedulers as well @@ -263,7 +263,7 @@ def dp_train(self, model): device_ids = list(range(device_ids)) # set dp device - torch.cuda.set_device(self.root_gpu) + # torch.cuda.set_device(self.root_gpu) model = LightningDataParallel(model, device_ids=device_ids) From 36c10b59fd114d142dbbe217d97eb2f2fb8b72bf Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:33:52 -0400 Subject: [PATCH 102/168] finished tests for structured results on train epoch --- pytorch_lightning/trainer/distrib_parts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 94c9f83328bfc..47a63207babc0 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -264,6 +264,7 @@ def dp_train(self, model): # set dp device # torch.cuda.set_device(self.root_gpu) + print(device_ids) model = LightningDataParallel(model, device_ids=device_ids) From e113e2c1e34defbaa2ee8089817750cc00bd8101 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:36:42 -0400 
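The repeated print(x.device, self.device, self.l1.weight.device) probes and the replica.to(device) experiments in the patches above suggest the DP quirk being chased: replicate places each replica's parameters on its own GPU, but a cached Python attribute such as the module-level _device is not updated, so self.device can still name the root GPU while the replica's weights live elsewhere. Inside a replicated step it is safer to derive the device from a parameter or from the incoming batch than from cached state. A small illustrative sketch, with a made-up module rather than the test model:

    import torch
    from torch import nn

    class TinyModule(nn.Module):
        def __init__(self):
            super().__init__()
            self.l1 = nn.Linear(4, 2)

        def step(self, x):
            # trust the parameters (or the batch), not a cached device attribute
            param_device = next(self.parameters()).device
            return self.l1(x.to(param_device)).sum()

    m = TinyModule()
    print(m.step(torch.randn(3, 4)))  # runs on CPU here; on a DP replica it follows the weights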
Subject: [PATCH 103/168] finished tests for structured results on train epoch --- pytorch_lightning/trainer/distrib_parts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 47a63207babc0..20edc0d60541a 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -263,8 +263,7 @@ def dp_train(self, model): device_ids = list(range(device_ids)) # set dp device - # torch.cuda.set_device(self.root_gpu) - print(device_ids) + torch.cuda.set_device(self.root_gpu) model = LightningDataParallel(model, device_ids=device_ids) From f7d2841a2dbdf44e16be0a72de57a5601cde7f42 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:37:16 -0400 Subject: [PATCH 104/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 62fe11f4b2750..72b1bc1d8fdf0 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -48,7 +48,7 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): x = x.view(x.size(0), -1) print(self.device, self.c_d1.weight.device, x.device) y_hat = self(x) - loss_val = self.loss(y, y_hat) + loss_val = self.loss(y.type_as(y_hat), y_hat) result = TrainResult(minimize=loss_val) result.log('train_step_acc1', loss_val + 1) self.training_step_called = True From 94ea112f5a2b4e2fa5ac07679868c0b992d587a8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:38:17 -0400 Subject: [PATCH 105/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 72b1bc1d8fdf0..3652a5f937aa5 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -48,7 +48,7 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): x = x.view(x.size(0), -1) print(self.device, self.c_d1.weight.device, x.device) y_hat = self(x) - loss_val = self.loss(y.type_as(y_hat), y_hat) + loss_val = self.loss(y.type_as(y_hat), y_hat.long()) result = TrainResult(minimize=loss_val) result.log('train_step_acc1', loss_val + 1) self.training_step_called = True From f5b4259310f21f2e850bedfbf161f203f879d12e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:39:52 -0400 Subject: [PATCH 106/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 3652a5f937aa5..ea86ac108cb09 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -48,7 +48,7 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): x = x.view(x.size(0), -1) print(self.device, self.c_d1.weight.device, x.device) y_hat = self(x) - loss_val = self.loss(y.type_as(y_hat), y_hat.long()) + loss_val = self.loss(y.type_as(y_hat).float(), y_hat.float()) result = TrainResult(minimize=loss_val) result.log('train_step_acc1', loss_val + 1) self.training_step_called = True From b0f6590ce5389658a13ee8bee4f3de472900bf6c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:40:40 -0400 Subject: [PATCH 107/168] finished tests for structured results on train 
epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index ea86ac108cb09..e1de8f39e8fa3 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -48,7 +48,7 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): x = x.view(x.size(0), -1) print(self.device, self.c_d1.weight.device, x.device) y_hat = self(x) - loss_val = self.loss(y.type_as(y_hat).float(), y_hat.float()) + loss_val = self.loss(y.type_as(y_hat).float(), y_hat.long()) result = TrainResult(minimize=loss_val) result.log('train_step_acc1', loss_val + 1) self.training_step_called = True From 5e1882be0eb770128c5a40347724488309f771d4 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:43:38 -0400 Subject: [PATCH 108/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index e1de8f39e8fa3..f913b9ddb3777 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -48,7 +48,7 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): x = x.view(x.size(0), -1) print(self.device, self.c_d1.weight.device, x.device) y_hat = self(x) - loss_val = self.loss(y.type_as(y_hat).float(), y_hat.long()) + loss_val = y_hat.sum() result = TrainResult(minimize=loss_val) result.log('train_step_acc1', loss_val + 1) self.training_step_called = True From 96f968976f7770f10ea2423703bbc11ee1d6ebbb Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 19 Jul 2020 11:44:28 -0400 Subject: [PATCH 109/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index f913b9ddb3777..d59e2e045ee02 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -46,7 +46,6 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): """ x, y = batch x = x.view(x.size(0), -1) - print(self.device, self.c_d1.weight.device, x.device) y_hat = self(x) loss_val = y_hat.sum() result = TrainResult(minimize=loss_val) From 5cd90fe8853a30118fa841c755e02c4d6a3808c9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 07:53:54 -0400 Subject: [PATCH 110/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 229d9133b1efd..86ebf00f6c24e 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,9 +188,9 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) - if hasattr(module, '_device'): - module._device = device - module = module.to(device) + # if hasattr(module, '_device'): + # module._device = device + # module = module.to(device) # --------------- # CHANGE From 4ebd847877f8a6712065b81dcc080be28cbe7c0c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 07:54:48 -0400 Subject: [PATCH 111/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 86ebf00f6c24e..e4e29517b41d1 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,10 +188,6 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) - # if hasattr(module, '_device'): - # module._device = device - # module = module.to(device) - # --------------- # CHANGE if module.training: From 4c3b03aef0140c5515d94ffaaec371cdba764d8c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 07:56:03 -0400 Subject: [PATCH 112/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index d59e2e045ee02..f038e868c62fb 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -49,7 +49,7 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): y_hat = self(x) loss_val = y_hat.sum() result = TrainResult(minimize=loss_val) - result.log('train_step_acc1', loss_val + 1) + result.log('train_step_test', loss_val + 1) self.training_step_called = True return result @@ -57,6 +57,7 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step """ + import pdb; pdb.set_trace() result.minimize = result.minimize.mean() result.checkpoint_on = result.checkpoint_on.mean() result.train_step_acc1 = result.train_step_acc1.mean() From 7886bcb9cc0fa714ad77401206874cbf04833e98 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 07:59:19 -0400 Subject: [PATCH 113/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 11 +++++------ tests/trainer/test_trainer_steps_result_return.py | 6 ++++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index f038e868c62fb..378fbfd450160 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -40,9 +40,9 @@ def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None): output /= 0 return output - def training_step_full_loop_result_obj(self, batch, batch_idx): + def training_step_full_loop_result_obj_dp(self, batch, batch_idx): """ - Full loop flow train step + Full loop flow train step (result obj + dp) """ x, y = batch x = x.view(x.size(0), -1) @@ -55,9 +55,8 @@ def training_step_full_loop_result_obj(self, batch, batch_idx): def training_step_end_full_loop_result_obj_dp(self, result): """ - Full loop flow train step + Full loop flow train step (result obj + dp) """ - import pdb; pdb.set_trace() result.minimize = result.minimize.mean() result.checkpoint_on = result.checkpoint_on.mean() result.train_step_acc1 = result.train_step_acc1.mean() @@ -65,9 +64,9 @@ def training_step_end_full_loop_result_obj_dp(self, result): self.training_step_end_called = True return result - def training_epoch_end_full_loop_result_obj(self, result): + def training_epoch_end_full_loop_result_obj_dp(self, result): """ - Full loop flow train step + Full loop flow train step (result obj + dp) """ result.log('train_epoch_end_acc1', 1) self.training_epoch_end_called = True diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 13a14492a0a4f..daeea1d6c3cff 100644 --- 
a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -7,6 +7,7 @@ from tests.base.deterministic_model import DeterministicModel from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult from tests.base import EvalModelTemplate +import pytest # test with train_step_end @@ -478,15 +479,16 @@ def test_use_callbacks_with_train_loop_only(tmpdir): assert ckpt_val['monitor'] == 'checkpoint_on' +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' model = EvalModelTemplate() model.validation_step = None model.test_step = None - model.training_step = model.training_step_full_loop_result_obj + model.training_step = model.training_step_full_loop_result_obj_dp model.training_step_end = model.training_step_end_full_loop_result_obj_dp - model.training_epoch_end = model.training_epoch_end_full_loop_result_obj + model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp model.val_dataloader = None model.test_dataloader = None From 3c2f53cae77b847e39954b3552533e25762537ae Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:01:11 -0400 Subject: [PATCH 114/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 378fbfd450160..1c06c57708464 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -49,7 +49,7 @@ def training_step_full_loop_result_obj_dp(self, batch, batch_idx): y_hat = self(x) loss_val = y_hat.sum() result = TrainResult(minimize=loss_val) - result.log('train_step_test', loss_val + 1) + result.log('train_step_metric', loss_val + 1) self.training_step_called = True return result @@ -59,8 +59,8 @@ def training_step_end_full_loop_result_obj_dp(self, result): """ result.minimize = result.minimize.mean() result.checkpoint_on = result.checkpoint_on.mean() - result.train_step_acc1 = result.train_step_acc1.mean() - result.log('train_step_end_acc1', 1) + result.train_step_metric = result.train_step_metric.mean() + result.log('train_step_end_metric', 1) self.training_step_end_called = True return result @@ -68,6 +68,6 @@ def training_epoch_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step (result obj + dp) """ - result.log('train_epoch_end_acc1', 1) + result.log('train_epoch_end_metric', 1) self.training_epoch_end_called = True return result From 0f3807f91f0efb3935710f235bb0994b0fc33fd7 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:22:47 -0400 Subject: [PATCH 115/168] finished tests for structured results on train epoch --- pytorch_lightning/utilities/debugging.py | 1 + .../test_trainer_steps_result_return.py | 28 +------------------ 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py index d8a7722fd8884..47f98ac4685e8 100644 --- a/pytorch_lightning/utilities/debugging.py +++ b/pytorch_lightning/utilities/debugging.py @@ -15,6 +15,7 @@ def __init__(self, trainer): def track_logged_metrics_history(self, scalar_metrics): if self.enabled: + scalar_metrics['global_step'] = self.trainer.global_step self.logged_metrics.append(scalar_metrics) def track_train_loss_history(self, batch_idx, loss): diff --git 
a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index daeea1d6c3cff..0f25290279076 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -500,7 +500,7 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): gpus=2, max_epochs=epochs, early_stop_callback=True, - row_log_interval=1, + row_log_interval=2, limit_train_batches=batches, weights_summary=None, ) @@ -508,31 +508,5 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): trainer.fit(model) import pdb; pdb.set_trace() - num_expected_epochs = 10 - - # ---------------------------------- - # VERIFY EARLY STOPPING BEHAVIOR - # ---------------------------------- - # with train loop only it happens on every epoch - early_stop_vals = trainer.dev_debugger.early_stopping_history - assert len(early_stop_vals) == num_expected_epochs - min_val = min([x['best'] for x in early_stop_vals]) - assert min_val == 171 + 9 - all_losses = trainer.dev_debugger.saved_losses - - from collections import Counter - batch_idxs = Counter([x['batch_idx'] for x in all_losses]) - for i, val in batch_idxs.items(): - assert val == num_expected_epochs - assert i in [0, 1, 2] - - # ---------------------------------- - # VERIFY CHECKPOINTING BEHAVIOR - # ---------------------------------- - ckpt_vals = trainer.dev_debugger.checkpoint_callback_history - assert len(ckpt_vals) == 5, '5 ckpts should have been saved' - for ckpt_val, expected_epoch in zip(ckpt_vals, [0, 1, 2, 3, 6]): - assert ckpt_val['epoch'] == expected_epoch - assert ckpt_val['monitor'] == 'checkpoint_on' test_full_train_loop_with_results_obj_dp('') \ No newline at end of file From 14db086093d35a1e4e0b78320de7d2d9c8c560c9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:25:59 -0400 Subject: [PATCH 116/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 0f25290279076..911c240b72d5d 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -508,5 +508,10 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): trainer.fit(model) import pdb; pdb.set_trace() + i = 0 + for metric in trainer.dev_debugger.logged_metrics: + assert metric['global_step'] == i + i += trainer.row_log_interval + test_full_train_loop_with_results_obj_dp('') \ No newline at end of file From 7452cd5684f146cad085cfa039a5477892c5acdd Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:27:46 -0400 Subject: [PATCH 117/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 1c06c57708464..3ed06a0427444 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -68,6 +68,6 @@ def training_epoch_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step (result obj + dp) """ - result.log('train_epoch_end_metric', 1) + result.log('train_epoch_end_metric', 1, on_epoch=True) self.training_epoch_end_called = True return result From 21ffdf2dc2e567270101d78ec31be61d25c5eb35 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:28:57 -0400 Subject: [PATCH 118/168] finished 
tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 911c240b72d5d..4a071eff80ebb 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -506,8 +506,13 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): ) trainer.fit(model) - import pdb; pdb.set_trace() + # make sure the loop was good + assert model.training_step_called + assert model.training_step_end_called + assert model.training_epoch_end_called + + # make sure we have the correct metrics logged i = 0 for metric in trainer.dev_debugger.logged_metrics: assert metric['global_step'] == i From ee31889bc52bfb5b004cdd6edd7ce13ad0b533dd Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:31:11 -0400 Subject: [PATCH 119/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 4a071eff80ebb..f974eb8ed57ff 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -512,11 +512,15 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): assert model.training_step_end_called assert model.training_epoch_end_called - # make sure we have the correct metrics logged - i = 0 + # make sure we saw all the correct keys + seen_keys = set() for metric in trainer.dev_debugger.logged_metrics: - assert metric['global_step'] == i - i += trainer.row_log_interval + seen_keys.update(metric.keys()) + + assert 'train_step_metric' in seen_keys + assert 'train_step_end_metric' in seen_keys + assert 'train_epoch_end_metric' in seen_keys + test_full_train_loop_with_results_obj_dp('') \ No newline at end of file From 59428214efd7e4b162b55a4d9bb6fc10dc816998 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:31:24 -0400 Subject: [PATCH 120/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index f974eb8ed57ff..762a9c33ea536 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -520,7 +520,3 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): assert 'train_step_metric' in seen_keys assert 'train_step_end_metric' in seen_keys assert 'train_epoch_end_metric' in seen_keys - - - -test_full_train_loop_with_results_obj_dp('') \ No newline at end of file From 2692014fc43bb8d53af02d6492f09f85b832a653 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:44:06 -0400 Subject: [PATCH 121/168] finished tests for structured results on train epoch --- tests/base/deterministic_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index a7c5b210d55cd..88651d8ace301 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -333,7 +333,6 @@ def backward(self, trainer, loss, optimizer, optimizer_idx): else: assert loss == 171.0 - import pdb; 
pdb.set_trace() loss.backward() From 042bcb6cb3d69da1c28fe6faab080baf5610ffb8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:46:00 -0400 Subject: [PATCH 122/168] finished tests for structured results on train epoch --- pytorch_lightning/callbacks/early_stopping.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 7035159e990ff..308408642d159 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -144,12 +144,14 @@ def on_validation_end(self, trainer, pl_module): def on_train_epoch_end(self, trainer, pl_module): # early stopping can also work in the train loop when there is no val loop and when using structured results should_check_early_stop = False - if 'early_stop_on' in trainer.callback_metrics and trainer.callback_metrics['early_stop_on'] is not None: + train_es_key = 'early_stop_on' + if train_es_key in trainer.callback_metrics and trainer.callback_metrics[train_es_key] is not None: self.monitor = 'early_stop_on' should_check_early_stop = True - if 'val_early_stop_on' in trainer.callback_metrics and trainer.callback_metrics['val_early_stop_on'] is not None: - self.monitor = 'val_early_stop_on' + val_es_key = 'val_early_stop_on' + if val_es_key in trainer.callback_metrics and trainer.callback_metrics[val_es_key] is not None: + self.monitor = val_es_key should_check_early_stop = True if should_check_early_stop: From 8a449e654c70a1c58581c7a389388bcdfe33cc40 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 08:47:18 -0400 Subject: [PATCH 123/168] finished tests for structured results on train epoch --- pytorch_lightning/core/step_result.py | 2 +- pytorch_lightning/utilities/debugging.py | 4 ++-- tests/base/deterministic_model.py | 27 ++++++++++++++++-------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index f3ca6b970cd22..b3a76ebfcc412 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -324,4 +324,4 @@ def log( result.hiddens = torch.tensor(1) result.log('some', 123) print(result) - result.minimize = torch.tensor(1) \ No newline at end of file + result.minimize = torch.tensor(1) diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py index 47f98ac4685e8..490356938fb6d 100644 --- a/pytorch_lightning/utilities/debugging.py +++ b/pytorch_lightning/utilities/debugging.py @@ -4,7 +4,7 @@ class InternalDebugger(object): def __init__(self, trainer): - + self.enabled = 'PL_DEV_DEBUG' in os.environ self.trainer = trainer self.logged_metrics = [] @@ -51,4 +51,4 @@ def track_checkpointing_history(self, filepath): 'rank': self.trainer.global_rank, 'filepath': filepath } - self.checkpoint_callback_history.append(debug_dict) \ No newline at end of file + self.checkpoint_callback_history.append(debug_dict) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index 88651d8ace301..dc4123b978921 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -152,9 +152,12 @@ def training_step_result_log_epoch_only(self, batch, batch_idx): acc = self.step(batch, batch_idx) result = TrainResult(minimize=acc) - result.log(f'epoch_log_and_pbar_acc1_e{self.current_epoch}', torch.tensor(14).type_as(acc), on_epoch=True, prog_bar=True, on_step=False) - 
result.log(f'epoch_log_acc2_e{self.current_epoch}', torch.tensor(15).type_as(acc), on_epoch=True, on_step=False) - result.log(f'epoch_pbar_acc3_e{self.current_epoch}', torch.tensor(16).type_as(acc), on_epoch=True, logger=False, prog_bar=True, on_step=False) + result.log(f'epoch_log_and_pbar_acc1_e{self.current_epoch}', torch.tensor(14).type_as(acc), + on_epoch=True, prog_bar=True, on_step=False) + result.log(f'epoch_log_acc2_e{self.current_epoch}', torch.tensor(15).type_as(acc), + on_epoch=True, on_step=False) + result.log(f'epoch_pbar_acc3_e{self.current_epoch}', torch.tensor(16).type_as(acc), + on_epoch=True, logger=False, prog_bar=True, on_step=False) self.training_step_called = True return result @@ -166,9 +169,12 @@ def training_step_result_log_epoch_and_step(self, batch, batch_idx): val_1 = (5 + batch_idx) * (self.current_epoch + 1) val_2 = (6 + batch_idx) * (self.current_epoch + 1) val_3 = (7 + batch_idx) * (self.current_epoch + 1) - result.log(f'step_epoch_log_and_pbar_acc1', torch.tensor(val_1).type_as(acc), on_epoch=True, prog_bar=True) - result.log(f'step_epoch_log_acc2', torch.tensor(val_2).type_as(acc), on_epoch=True) - result.log(f'step_epoch_pbar_acc3', torch.tensor(val_3).type_as(acc), on_epoch=True, logger=False, prog_bar=True) + result.log(f'step_epoch_log_and_pbar_acc1', torch.tensor(val_1).type_as(acc), + on_epoch=True, prog_bar=True) + result.log(f'step_epoch_log_acc2', torch.tensor(val_2).type_as(acc), + on_epoch=True) + result.log(f'step_epoch_pbar_acc3', torch.tensor(val_3).type_as(acc), + on_epoch=True, logger=False, prog_bar=True) self.training_step_called = True return result @@ -188,9 +194,12 @@ def training_epoch_end_return_for_log_epoch_and_step(self, result): result.step_epoch_log_and_pbar_acc1 = result.step_epoch_log_and_pbar_acc1.prod() result.step_epoch_log_acc2 = result.step_epoch_log_acc2.prod() result.step_epoch_pbar_acc3 = result.step_epoch_pbar_acc3.prod() - result.log('epoch_end_log_acc', torch.tensor(1212).type_as(result.step_epoch_log_acc2), logger=True, on_epoch=True) - result.log('epoch_end_pbar_acc', torch.tensor(1213).type_as(result.step_epoch_log_acc2), logger=False, prog_bar=True, on_epoch=True) - result.log('epoch_end_log_pbar_acc', torch.tensor(1214).type_as(result.step_epoch_log_acc2), logger=True, prog_bar=True, on_epoch=True) + result.log('epoch_end_log_acc', torch.tensor(1212).type_as(result.step_epoch_log_acc2), + logger=True, on_epoch=True) + result.log('epoch_end_pbar_acc', torch.tensor(1213).type_as(result.step_epoch_log_acc2), + logger=False, prog_bar=True, on_epoch=True) + result.log('epoch_end_log_pbar_acc', torch.tensor(1214).type_as(result.step_epoch_log_acc2), + logger=True, prog_bar=True, on_epoch=True) return result # -------------------------- From e7d158569ce31ecea61ace607803b0cf195b5e92 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 20 Jul 2020 15:18:17 +0200 Subject: [PATCH 124/168] cache --- .github/workflows/ci-testing.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index dea7396125c7e..7c3095fc281a1 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -82,9 +82,9 @@ jobs: uses: actions/cache@v1 with: path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/extra.txt') }} + key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ 
hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/extra.txt') }} restore-keys: | - ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip- + ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.requires }}-pip- - name: Install dependencies run: | From 1d34947d0a179ed901dabdc6e2e6d88fa314e5d8 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 09:22:35 -0400 Subject: [PATCH 125/168] finished tests for structured results on train epoch --- tests/callbacks/test_model_checkpoint.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py index 1091a4cf3a8dd..da3c908a1e11b 100644 --- a/tests/callbacks/test_model_checkpoint.py +++ b/tests/callbacks/test_model_checkpoint.py @@ -78,11 +78,11 @@ def __init__(self, expected_count, *args, **kwargs): self.count = 0 self.expected_count = expected_count - def _save_model(self, filepath): + def _save_model(self, filepath, trainer, pl_module): # make sure we don't save twice assert not os.path.isfile(filepath) self.count += 1 - super()._save_model(filepath) + super()._save_model(filepath, trainer, pl_module) def on_train_end(self, trainer, pl_module): super().on_train_end(trainer, pl_module) @@ -107,3 +107,6 @@ def test_model_checkpoint_no_extraneous_invocations(tmpdir): ) result = trainer.fit(model) assert 1 == result + +if __name__ == '__main__': + test_model_checkpoint_no_extraneous_invocations('') \ No newline at end of file From bfde914558ccfdce85c57dc1cd711b9bb8f2d7de Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 09:22:50 -0400 Subject: [PATCH 126/168] finished tests for structured results on train epoch --- tests/callbacks/test_model_checkpoint.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py index da3c908a1e11b..7257dc3874a2a 100644 --- a/tests/callbacks/test_model_checkpoint.py +++ b/tests/callbacks/test_model_checkpoint.py @@ -107,6 +107,3 @@ def test_model_checkpoint_no_extraneous_invocations(tmpdir): ) result = trainer.fit(model) assert 1 == result - -if __name__ == '__main__': - test_model_checkpoint_no_extraneous_invocations('') \ No newline at end of file From 2e7b68dd572533b2368076aec0f7308b07cffde9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 09:45:25 -0400 Subject: [PATCH 127/168] finished tests for structured results on train epoch --- pytorch_lightning/trainer/training_loop.py | 5 ++- tests/models/test_grad_norm.py | 44 ++++------------------ 2 files changed, 10 insertions(+), 39 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index bca2ec8dbb943..5ee9803f390cd 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -635,8 +635,9 @@ def save_train_loop_metrics_to_loggers(self, batch_idx, batch_output): if should_log_metrics or self.fast_dev_run: # logs user requested information to logger metrics = batch_output.batch_log_metrics - if len(metrics) > 0: - self.log_metrics(metrics, batch_output.grad_norm_dic) + grad_norm_dic = batch_output.grad_norm_dic + if len(metrics) > 0 or len(grad_norm_dic) > 0: + self.log_metrics(metrics, grad_norm_dic) def save_loggers_in_training_loop(self, batch_idx): # when loggers should save to disk diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 
ff627c5088987..d7978965a3cfe 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -1,43 +1,12 @@ import numpy as np import pytest +import os from pytorch_lightning import Trainer -from pytorch_lightning.loggers import LightningLoggerBase -from pytorch_lightning.utilities import rank_zero_only from tests.base import EvalModelTemplate from tests.base.develop_utils import reset_seed -class OnlyMetricsListLogger(LightningLoggerBase): - def __init__(self): - super().__init__() - self.metrics = [] - - @rank_zero_only - def log_metrics(self, metrics, step): - self.metrics.append(metrics) - - @property - def experiment(self): - return 'test' - - @rank_zero_only - def log_hyperparams(self, params): - pass - - @rank_zero_only - def finalize(self, status): - pass - - @property - def name(self): - return 'name' - - @property - def version(self): - return '1' - - class ModelWithManualGradTracker(EvalModelTemplate): def __init__(self, norm_type, *args, **kwargs): super().__init__(*args, **kwargs) @@ -75,28 +44,29 @@ def on_after_backward(self): @pytest.mark.parametrize("norm_type", [1., 1.25, 1.5, 2, 3, 5, 10, 'inf']) def test_grad_tracking(tmpdir, norm_type, rtol=5e-3): - # rtol=5e-3 respects the 3 decmials rounding in `.grad_norms` and above + os.environ['PL_DEV_DEBUG'] = '1' + + # rtol=5e-3 respects the 3 decimals rounding in `.grad_norms` and above reset_seed() # use a custom grad tracking module and a list logger model = ModelWithManualGradTracker(norm_type) - logger = OnlyMetricsListLogger() trainer = Trainer( default_root_dir=tmpdir, max_epochs=3, - logger=logger, track_grad_norm=norm_type, row_log_interval=1, # request grad_norms every batch ) result = trainer.fit(model) assert result == 1, "Training failed" - assert len(logger.metrics) == len(model.stored_grad_norms) + logged_metrics = trainer.dev_debugger.logged_metrics + assert len(logged_metrics) == len(model.stored_grad_norms) # compare the logged metrics against tracked norms on `.backward` - for mod, log in zip(model.stored_grad_norms, logger.metrics): + for mod, log in zip(model.stored_grad_norms, logged_metrics): common = mod.keys() & log.keys() log, mod = [log[k] for k in common], [mod[k] for k in common] From 71712d89313afef80946e1e2e156a7bf3cec20b5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 09:57:27 -0400 Subject: [PATCH 128/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 762a9c33ea536..a0ddeacdb9c99 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -51,7 +51,7 @@ def test_training_step_result_log_step_only(tmpdir): assert logged_metrics[f'step_log_and_pbar_acc1_b{batch_idx}'] == 11.0 assert logged_metrics[f'step_log_acc2_b{batch_idx}'] == 12.0 assert f'step_pbar_acc3_b{batch_idx}' not in logged_metrics - assert len(logged_metrics) == 3 + assert len(logged_metrics) == 4 # make sure we are using the correct metrics for callbacks assert trainer.callback_metrics['checkpoint_on'] == 171 @@ -124,7 +124,7 @@ def test_training_step_result_log_epoch_only(tmpdir): assert logged_metrics[f'epoch_log_and_pbar_acc1_e{batch_idx}'] == 14.0 assert logged_metrics[f'epoch_log_acc2_e{batch_idx}'] == 15.0 assert f'epoch_pbar_acc3_e{batch_idx}' not in logged_metrics - assert 
len(logged_metrics) == 3 + assert len(logged_metrics) == 4 # make sure we are using the correct metrics for callbacks assert trainer.callback_metrics['checkpoint_on'] == 171 @@ -211,7 +211,7 @@ def test_training_step_result_log_step_and_epoch(tmpdir): assert logged_metrics['step_epoch_log_and_pbar_acc1'] == expected_val_1 assert logged_metrics['step_epoch_log_acc2'] == expected_val_2 assert 'step_epoch_pbar_acc3' not in logged_metrics - assert len(logged_metrics) == 3 + assert len(logged_metrics) == 4 # make sure the metrics for the epoch end are actual means (the default reduce fx) or all the batches epoch_end_metrics = epoch_outputs[-1] @@ -220,7 +220,7 @@ def test_training_step_result_log_step_and_epoch(tmpdir): assert epoch_end_metrics['step_epoch_log_and_pbar_acc1'] == eval_1 assert epoch_end_metrics['step_epoch_log_acc2'] == eval_2 assert 'step_epoch_pbar_acc3' not in epoch_end_metrics - assert len(logged_metrics) == 3 + assert len(logged_metrics) == 4 # make sure we are using the correct metrics for callbacks assert trainer.callback_metrics['checkpoint_on'] == 171 From d93845e14bbfef607acc399dbe7428737310c9e1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 10:14:23 -0400 Subject: [PATCH 129/168] finished tests for structured results on train epoch --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 5ee9803f390cd..0caf9f22b5108 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -472,10 +472,10 @@ def run_training_epoch(self): # otherwise we will build up unnecessary memory step_out = batch_output.training_step_output_for_epoch_end should_auto_reduce_train_result = isinstance(step_out, Result) and step_out.should_reduce_on_epoch_end - if 'early_stop_on' in step_out: + if isinstance(step_out, dict) and 'early_stop_on' in step_out: early_stopping_accumulator.accumulate(step_out['early_stop_on']) - if 'checkpoint_on' in step_out: + if isinstance(step_out, dict) and 'checkpoint_on' in step_out: checkpoint_accumulator.accumulate(step_out['checkpoint_on']) if self.is_overridden('training_epoch_end', model=self.get_model()) or should_auto_reduce_train_result: From 1ec899227f886b8dd9e1442d96e41cdd7f1d2961 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 10:14:56 -0400 Subject: [PATCH 130/168] Update pytorch_lightning/callbacks/early_stopping.py Co-authored-by: Jirka Borovec --- pytorch_lightning/callbacks/early_stopping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 308408642d159..39d3615186303 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -150,7 +150,7 @@ def on_train_epoch_end(self, trainer, pl_module): should_check_early_stop = True val_es_key = 'val_early_stop_on' - if val_es_key in trainer.callback_metrics and trainer.callback_metrics[val_es_key] is not None: + if trainer.callback_metrics.get(val_es_key, None) is not None: self.monitor = val_es_key should_check_early_stop = True From e272a59d452c11ca5af8b3ef8d81499bf1cc2011 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 10:15:21 -0400 Subject: [PATCH 131/168] Update pytorch_lightning/callbacks/early_stopping.py Co-authored-by: Jirka Borovec --- 
pytorch_lightning/callbacks/early_stopping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 39d3615186303..4f7bb572e6bf5 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -146,7 +146,7 @@ def on_train_epoch_end(self, trainer, pl_module): should_check_early_stop = False train_es_key = 'early_stop_on' if train_es_key in trainer.callback_metrics and trainer.callback_metrics[train_es_key] is not None: - self.monitor = 'early_stop_on' + self.monitor = train_es_key should_check_early_stop = True val_es_key = 'val_early_stop_on' From 6c8f2e5a7916b5196fed6cbbdeb0e9e2f3dd5394 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 10:16:07 -0400 Subject: [PATCH 132/168] Update pytorch_lightning/callbacks/early_stopping.py Co-authored-by: Jirka Borovec --- pytorch_lightning/callbacks/early_stopping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 4f7bb572e6bf5..4e22cba977198 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -145,7 +145,7 @@ def on_train_epoch_end(self, trainer, pl_module): # early stopping can also work in the train loop when there is no val loop and when using structured results should_check_early_stop = False train_es_key = 'early_stop_on' - if train_es_key in trainer.callback_metrics and trainer.callback_metrics[train_es_key] is not None: + if trainer.callback_metrics.get(train_es_key, None) is not None: self.monitor = train_es_key should_check_early_stop = True From 4ce032fba98e9840153de6fb7bcca4d31f2dc78a Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 20 Jul 2020 16:32:54 +0200 Subject: [PATCH 133/168] Update pytorch_lightning/callbacks/model_checkpoint.py --- pytorch_lightning/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index eb81e879c7d56..370a30b75dc4d 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -276,7 +276,7 @@ def on_validation_end(self, trainer, pl_module): epoch = trainer.current_epoch # support structured results - if 'checkpoint_on' in metrics and metrics['checkpoint_on'] is not None: + if metrics.get('checkpoint_on') is not None: self.monitor = 'checkpoint_on' if self.save_top_k == 0: From b7ea0ccaac3d8a13441fe170471cd36e77d03154 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 20 Jul 2020 16:33:49 +0200 Subject: [PATCH 134/168] Update pytorch_lightning/core/step_result.py --- pytorch_lightning/core/step_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index b3a76ebfcc412..f5b22a5585574 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -9,7 +9,7 @@ class Result(Dict): def __init__( self, minimize: Optional[Tensor] = None, - early_stop_on: Tensor = None, + early_stop_on: Optional[Tensor] = None, checkpoint_on: Union[Tensor, bool] = None, hiddens: Optional[Tensor] = None ): From 3e7af00506cda4c10376a917676d57e1a40f0ebb Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 10:33:53 -0400 Subject: 
[PATCH 135/168] finished tests for structured results on train epoch --- pytorch_lightning/overrides/data_parallel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index e4e29517b41d1..229d9133b1efd 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,6 +188,10 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) + if hasattr(module, '_device'): + module._device = device + module = module.to(device) + # --------------- # CHANGE if module.training: From b4ad5c2d5c2a23f4d9ab381b4592f336eb74eda0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 10:49:19 -0400 Subject: [PATCH 136/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 3ed06a0427444..5a30c7f2874b2 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -46,7 +46,7 @@ def training_step_full_loop_result_obj_dp(self, batch, batch_idx): """ x, y = batch x = x.view(x.size(0), -1) - y_hat = self(x) + y_hat = self.forward(x) loss_val = y_hat.sum() result = TrainResult(minimize=loss_val) result.log('train_step_metric', loss_val + 1) From a2c2401c0118b74afce0c1793415032fc27c5229 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 20 Jul 2020 17:27:55 +0200 Subject: [PATCH 137/168] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/core/hooks.py | 2 +- pytorch_lightning/core/step_result.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 60e93aa275d93..f2762e39cced0 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -129,7 +129,7 @@ def on_train_epoch_end(self) -> None: def on_val_epoch_start(self) -> None: """ - Called in the training loop at the very beginning of the epoch. + Called in the validation loop at the very beginning of the epoch. 
""" # do something when the epoch starts diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index f5b22a5585574..6cc044ef1c220 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -10,7 +10,7 @@ def __init__( self, minimize: Optional[Tensor] = None, early_stop_on: Optional[Tensor] = None, - checkpoint_on: Union[Tensor, bool] = None, + checkpoint_on: Union[Tensor, bool, None] = None, hiddens: Optional[Tensor] = None ): @@ -36,7 +36,7 @@ def __init__( } } - def __getattr__(self, key): + def __getattr__(self, key: str) -> Any: try: if key == 'callback_metrics': return self.get_callback_metrics() @@ -53,7 +53,7 @@ def __getattr__(self, key): except KeyError: return None - def __setattr__(self, key, val): + def __setattr__(self, key: str, val: Union[Tensor, Any]): # ensure reserve keys are tensors and detached if key in {'hiddens', 'checkpoint_on', 'early_stop_on'}: self._assert_tensor_metric(key, val) @@ -119,7 +119,7 @@ def __set_meta(self, name, value, prog_bar, logger, on_step, on_epoch, reduce_fx internal = self['meta']['_internal'] internal['_reduce_on_epoch'] = max(internal['_reduce_on_epoch'], on_epoch) - def get_callback_metrics(self): + def get_callback_metrics(self) -> dict: result = { 'early_stop_on': self.early_stop_on, 'checkpoint_on': self.checkpoint_on @@ -127,7 +127,7 @@ def get_callback_metrics(self): return result - def get_batch_log_metrics(self): + def get_batch_log_metrics(self) -> dict: """ Gets the metrics to log at the end of the batch step """ @@ -141,7 +141,7 @@ def get_batch_log_metrics(self): result[k] = self[k] return result - def get_epoch_log_metrics(self): + def get_epoch_log_metrics(self) -> dict: """ Gets the metrics to log at the end of the batch step """ @@ -236,7 +236,7 @@ def reduce_on_epoch_end(cls, outputs): return result @property - def should_reduce_on_epoch_end(self): + def should_reduce_on_epoch_end(self) -> bool: return self['meta']['_internal']['_reduce_on_epoch'] @@ -257,7 +257,7 @@ def recursive_gather(outputs, result=None): return result -def recursive_stack(result): +def recursive_stack(result: MutableMapping): for k, v in result.items(): if isinstance(v, dict): recursive_stack(v) @@ -297,8 +297,8 @@ class EvalResult(Result): def __init__( self, - early_stop_on: Tensor = None, - checkpoint_on: Tensor = None, + early_stop_on: Optional[Tensor] = None, + checkpoint_on: Optional[Tensor] = None, hiddens: Optional[Tensor] = None ): From 7102cef1af2a1bad669aeed6ba02404e0c08e186 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 20 Jul 2020 17:55:39 +0200 Subject: [PATCH 138/168] Apply suggestions from code review Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/core/step_result.py | 55 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 6cc044ef1c220..ac520e9c214e6 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -66,9 +66,9 @@ def __setattr__(self, key: str, val: Union[Tensor, Any]): self[key] = val - def _assert_tensor_metric(self, name, x): - if x is not None and not isinstance(x, bool): - assert isinstance(x, Tensor), f'{name} must be a torch.Tensor' + def _assert_tensor_metric(self, name: str, potential_metric: Union[bool, Tensor, None, Any]): + if potential_metric is not None and not isinstance(potential_metric, bool): + assert 
isinstance(potential_metric, Tensor), f'{name} must be a torch.Tensor' def _assert_grad_tensor_metric(self, name, x, additional_err: str = None): if x is not None: @@ -83,12 +83,12 @@ def log( self, name, value, - prog_bar=False, - logger=True, - on_step=False, - on_epoch=True, - reduce_fx=torch.mean, - enable_graph=False, + prog_bar: bool = False, + logger: bool = True, + on_step: bool = False, + on_epoch: bool = True, + reduce_fx: Callable = torch.mean, + enable_graph: bool = False, ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): @@ -102,7 +102,16 @@ def log( # set the value self.__setitem__(name, value) - def __set_meta(self, name, value, prog_bar, logger, on_step, on_epoch, reduce_fx): + def __set_meta( + self, + name: str, + value, + prog_bar: bool, + logger: bool, + on_step: bool, + on_epoch: bool, + reduce_fx: Callable, + ): # set the meta for the item meta_value = value meta = dict( @@ -240,7 +249,7 @@ def should_reduce_on_epoch_end(self) -> bool: return self['meta']['_internal']['_reduce_on_epoch'] -def recursive_gather(outputs, result=None): +def recursive_gather(outputs: Sequence[dict], result: Optional[MutableMapping] = None) -> Optional[MutableMapping]: for out in outputs: if 'meta' in out: del out['meta'] @@ -283,12 +292,12 @@ def log( self, name, value, - prog_bar=False, - logger=True, - on_step=True, - on_epoch=False, - reduce_fx=torch.mean, - enable_graph=False, + prog_bar: bool = False, + logger: bool = True, + on_step: bool = True, + on_epoch: bool = False, + reduce_fx: Callable = torch.mean, + enable_graph: bool = False, ): super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) @@ -308,12 +317,12 @@ def log( self, name, value, - prog_bar=False, - logger=True, - on_step=False, - on_epoch=True, - reduce_fx=torch.mean, - enable_graph=False, + prog_bar: bool = False, + logger: bool = True, + on_step: bool = False, + on_epoch: bool = True, + reduce_fx: Callable = torch.mean, + enable_graph: bool = False, ): super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) From 12ef3b0f99e7aedb9fb802ba50022f063170d8df Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 20 Jul 2020 18:14:57 +0200 Subject: [PATCH 139/168] Apply suggestions from code review Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- pytorch_lightning/core/step_result.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index ac520e9c214e6..89586f6bb52b2 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -70,7 +70,7 @@ def _assert_tensor_metric(self, name: str, potential_metric: Union[bool, Tensor, if potential_metric is not None and not isinstance(potential_metric, bool): assert isinstance(potential_metric, Tensor), f'{name} must be a torch.Tensor' - def _assert_grad_tensor_metric(self, name, x, additional_err: str = None): + def _assert_grad_tensor_metric(self, name: str, x: Union[torch.Tensor, Any], additional_err: str = ''): if x is not None: assert isinstance(x, Tensor), f'{name} must be a torch.Tensor' m = f'{name} must have a computational graph.' 
@@ -125,8 +125,7 @@ def __set_meta( self['meta'][name] = meta # track whether any input requires reduction on epoch end - internal = self['meta']['_internal'] - internal['_reduce_on_epoch'] = max(internal['_reduce_on_epoch'], on_epoch) + self['meta']['_internal']['_reduce_on_epoch'] = max(internal['_reduce_on_epoch'], on_epoch) def get_callback_metrics(self) -> dict: result = { From 2116a61aa6ed0d4dbbe91b9549cb1e7caf63f58b Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 20 Jul 2020 18:27:30 +0200 Subject: [PATCH 140/168] simple --- pytorch_lightning/core/step_result.py | 47 +++++++++------------------ 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 89586f6bb52b2..e58f9d0c9b137 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -1,9 +1,11 @@ -from typing import Optional, Dict, Union +from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any from torch import Tensor import torch from copy import copy + + class Result(Dict): def __init__( @@ -125,7 +127,8 @@ def __set_meta( self['meta'][name] = meta # track whether any input requires reduction on epoch end - self['meta']['_internal']['_reduce_on_epoch'] = max(internal['_reduce_on_epoch'], on_epoch) + _internal = self['meta']['_internal'] + _internal['_reduce_on_epoch'] = max(_internal['_reduce_on_epoch'], on_epoch) def get_callback_metrics(self) -> dict: result = { @@ -135,7 +138,7 @@ def get_callback_metrics(self) -> dict: return result - def get_batch_log_metrics(self) -> dict: + def _get_metrics(self, opt_names: Sequence[str]) -> dict: """ Gets the metrics to log at the end of the batch step """ @@ -145,51 +148,33 @@ def get_batch_log_metrics(self) -> dict: for k, options in meta.items(): if k == '_internal': continue - if options['logger'] and options['on_step']: + if all(options[n] for n in opt_names): result[k] = self[k] return result - def get_epoch_log_metrics(self) -> dict: + def get_batch_log_metrics(self) -> dict: """ Gets the metrics to log at the end of the batch step """ - result = {} + return self._get_metrics(self, opt_names=['logger', 'on_step']) - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - if options['logger'] and options['on_epoch']: - result[k] = self[k] - return result + def get_epoch_log_metrics(self) -> dict: + """ + Gets the metrics to log at the end of the batch step + """ + return self._get_metrics(self, opt_names=['logger', 'on_epoch']) def get_epoch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step """ - result = {} - - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - if options['prog_bar'] and options['on_epoch']: - result[k] = self[k] - return result + return self._get_metrics(self, opt_names=['prog_bar', 'on_epoch']) def get_batch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step """ - result = {} - - meta = self['meta'] - for k, options in meta.items(): - if k == '_internal': - continue - if options['prog_bar'] and options['on_step']: - result[k] = self[k] - return result + return self._get_metrics(self, opt_names=['prog_bar', 'on_epoch']) def detach(self): for k, v in self.items(): From fd5445d1765ce09bf81f064729f645a115f69ebe Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 12:34:16 -0400 Subject: [PATCH 141/168] finished tests for structured results on train epoch --- 
tests/trainer/test_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 0c4212b66f390..e45040f8ceaaf 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -589,7 +589,7 @@ def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k): with pytest.raises(FileNotFoundError): trainer.test(ckpt_path='random.ckpt') else: - ckpt_path = str(list((Path(tmpdir) / 'lightning_logs/version_0/checkpoints').iterdir())[0].absolute()) + ckpt_path = str(list((Path(tmpdir) / f'lightning_logs/version_{trainer.logger.version}/checkpoints').iterdir())[0].absolute()) trainer.test(ckpt_path=ckpt_path) assert trainer.tested_ckpt_path == ckpt_path From 6a63fe0a21e2e706590f3799a26e7f522fd7ff63 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 20 Jul 2020 19:10:08 +0200 Subject: [PATCH 142/168] simple --- pytorch_lightning/core/step_result.py | 38 +++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index e58f9d0c9b137..139cefb7db7d5 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -4,8 +4,6 @@ from copy import copy - - class Result(Dict): def __init__( @@ -13,7 +11,7 @@ def __init__( minimize: Optional[Tensor] = None, early_stop_on: Optional[Tensor] = None, checkpoint_on: Union[Tensor, bool, None] = None, - hiddens: Optional[Tensor] = None + hiddens: Optional[Tensor] = None, ): super().__init__() @@ -83,8 +81,8 @@ def _assert_grad_tensor_metric(self, name: str, x: Union[torch.Tensor, Any], add def log( self, - name, - value, + name: str, + value: Any, prog_bar: bool = False, logger: bool = True, on_step: bool = False, @@ -107,7 +105,7 @@ def log( def __set_meta( self, name: str, - value, + value: Any, prog_bar: bool, logger: bool, on_step: bool, @@ -138,7 +136,7 @@ def get_callback_metrics(self) -> dict: return result - def _get_metrics(self, opt_names: Sequence[str]) -> dict: + def __get_meta_metrics(self, opt_names: Sequence[str]) -> dict: """ Gets the metrics to log at the end of the batch step """ @@ -156,25 +154,25 @@ def get_batch_log_metrics(self) -> dict: """ Gets the metrics to log at the end of the batch step """ - return self._get_metrics(self, opt_names=['logger', 'on_step']) + return self.__get_meta_metrics(self, opt_names=['logger', 'on_step']) def get_epoch_log_metrics(self) -> dict: """ Gets the metrics to log at the end of the batch step """ - return self._get_metrics(self, opt_names=['logger', 'on_epoch']) + return self.__get_meta_metrics(self, opt_names=['logger', 'on_epoch']) def get_epoch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step """ - return self._get_metrics(self, opt_names=['prog_bar', 'on_epoch']) + return self.__get_meta_metrics(self, opt_names=['prog_bar', 'on_epoch']) def get_batch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step """ - return self._get_metrics(self, opt_names=['prog_bar', 'on_epoch']) + return self.__get_meta_metrics(self, opt_names=['prog_bar', 'on_epoch']) def detach(self): for k, v in self.items(): @@ -267,7 +265,7 @@ def __init__( minimize: Optional[Tensor] = None, early_stop_on: Tensor = None, checkpoint_on: Union[Tensor, bool] = None, - hiddens: Optional[Tensor] = None + hiddens: Optional[Tensor] = None, ): super().__init__(minimize, early_stop_on, checkpoint_on, hiddens) @@ -292,7 +290,7 @@ def __init__( self, early_stop_on: 
Optional[Tensor] = None, checkpoint_on: Optional[Tensor] = None, - hiddens: Optional[Tensor] = None + hiddens: Optional[Tensor] = None, ): super().__init__(None, early_stop_on, checkpoint_on, hiddens) @@ -311,10 +309,10 @@ def log( super().log(name, value, prog_bar, logger, on_step, on_epoch, reduce_fx, enable_graph) -if __name__ == '__main__': - import torch - result = TrainResult() - result.hiddens = torch.tensor(1) - result.log('some', 123) - print(result) - result.minimize = torch.tensor(1) +# if __name__ == '__main__': +# import torch +# result = TrainResult() +# result.hiddens = torch.tensor(1) +# result.log('some', 123) +# print(result) +# result.minimize = torch.tensor(1) From d650daf92c7e6839c9e1430152f5f63352f8c948 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 20 Jul 2020 19:23:33 +0200 Subject: [PATCH 143/168] simple --- pytorch_lightning/utilities/parsing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/parsing.py b/pytorch_lightning/utilities/parsing.py index 7acaea4fd26c0..920e14bbefeb4 100644 --- a/pytorch_lightning/utilities/parsing.py +++ b/pytorch_lightning/utilities/parsing.py @@ -1,5 +1,6 @@ import inspect from argparse import Namespace +from typing import Dict def str_to_bool(val): @@ -93,7 +94,7 @@ def collect_init_args(frame, path_args: list, inside: bool = False) -> list: return path_args -class AttributeDict(dict): +class AttributeDict(Dict): """Extended dictionary accesisable with dot notation. >>> ad = AttributeDict({'key1': 1, 'key2': 'abc'}) From 6abb73a43843431869d6a3fac068675596334c17 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 20 Jul 2020 19:34:26 +0200 Subject: [PATCH 144/168] revert --- pytorch_lightning/core/step_result.py | 40 +++++++++++++++++++-------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 139cefb7db7d5..73aeb7bb39fdd 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -136,7 +136,7 @@ def get_callback_metrics(self) -> dict: return result - def __get_meta_metrics(self, opt_names: Sequence[str]) -> dict: + def get_batch_log_metrics(self) -> dict: """ Gets the metrics to log at the end of the batch step """ @@ -146,33 +146,51 @@ def __get_meta_metrics(self, opt_names: Sequence[str]) -> dict: for k, options in meta.items(): if k == '_internal': continue - if all(options[n] for n in opt_names): + if options['logger'] and options['on_step']: result[k] = self[k] return result - def get_batch_log_metrics(self) -> dict: - """ - Gets the metrics to log at the end of the batch step - """ - return self.__get_meta_metrics(self, opt_names=['logger', 'on_step']) - def get_epoch_log_metrics(self) -> dict: """ Gets the metrics to log at the end of the batch step """ - return self.__get_meta_metrics(self, opt_names=['logger', 'on_epoch']) + result = {} + + meta = self['meta'] + for k, options in meta.items(): + if k == '_internal': + continue + if options['logger'] and options['on_epoch']: + result[k] = self[k] + return result def get_epoch_pbar_metrics(self): """ Gets the metrics to log at the end of the batch step """ - return self.__get_meta_metrics(self, opt_names=['prog_bar', 'on_epoch']) + result = {} + + meta = self['meta'] + for k, options in meta.items(): + if k == '_internal': + continue + if options['prog_bar'] and options['on_epoch']: + result[k] = self[k] + return result def get_batch_pbar_metrics(self): """ Gets the metrics to log at the end of the 
batch step """ - return self.__get_meta_metrics(self, opt_names=['prog_bar', 'on_epoch']) + result = {} + + meta = self['meta'] + for k, options in meta.items(): + if k == '_internal': + continue + if options['prog_bar'] and options['on_step']: + result[k] = self[k] + return result def detach(self): for k, v in self.items(): From 6333f2121991b79677abd6b4b478e729924d220f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 13:36:05 -0400 Subject: [PATCH 145/168] finished tests for structured results on train epoch --- pytorch_lightning/callbacks/base.py | 4 ++-- pytorch_lightning/trainer/callback_hook.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index 37ef84c796ec2..7c1d05547790c 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -54,11 +54,11 @@ def on_train_epoch_end(self, trainer, pl_module): """Called when the train epoch ends.""" pass - def on_val_epoch_start(self, trainer, pl_module): + def on_validation_epoch_start(self, trainer, pl_module): """Called when the val epoch begins.""" pass - def on_val_epoch_end(self, trainer, pl_module): + def on_validation_epoch_end(self, trainer, pl_module): """Called when the val epoch ends.""" pass diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 6266cccc25f1e..616d500fc27b9 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -61,15 +61,15 @@ def on_train_epoch_end(self): for callback in self.callbacks: callback.on_train_epoch_end(self, self.get_model()) - def on_val_epoch_start(self): + def on_validation_epoch_start(self): """Called when the epoch begins.""" for callback in self.callbacks: - callback.on_val_epoch_start(self, self.get_model()) + callback.on_validation_epoch_start(self, self.get_model()) - def on_val_epoch_end(self): + def on_validation_epoch_end(self): """Called when the epoch begins.""" for callback in self.callbacks: - callback.on_val_epoch_end(self, self.get_model()) + callback.on_validation_epoch_end(self, self.get_model()) def on_test_epoch_start(self): """Called when the epoch begins.""" From f8591b4f0dd4140f68b65b4cea73c83772e48fe2 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 13:36:58 -0400 Subject: [PATCH 146/168] finished tests for structured results on train epoch --- pytorch_lightning/core/hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index f2762e39cced0..d63698f20dadf 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -127,13 +127,13 @@ def on_train_epoch_end(self) -> None: """ # do something when the epoch ends - def on_val_epoch_start(self) -> None: + def on_validation_epoch_start(self) -> None: """ Called in the validation loop at the very beginning of the epoch. """ # do something when the epoch starts - def on_val_epoch_end(self) -> None: + def on_validation_epoch_end(self) -> None: """ Called in the training loop at the very end of the epoch. 
""" From ff088ca2f6191e264a7d34a4fd18b10f6459807c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 13:43:32 -0400 Subject: [PATCH 147/168] Update tests/base/deterministic_model.py Co-authored-by: Jirka Borovec --- tests/base/deterministic_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index dc4123b978921..2b892dc78e02d 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -3,7 +3,6 @@ from torch import nn from torch.utils.data import Dataset, DataLoader from pytorch_lightning import TrainResult -import pdb from pytorch_lightning.core.lightning import LightningModule From 595fd4b8f3540a03c12b6c68cabe7a59ca577e2c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 14:02:39 -0400 Subject: [PATCH 148/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index a0ddeacdb9c99..434accf05aad9 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -497,7 +497,7 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): trainer = Trainer( default_root_dir=tmpdir, distributed_backend='dp', - gpus=2, + gpus=[0, 1], max_epochs=epochs, early_stop_callback=True, row_log_interval=2, From 74cd04973f6c3fd9eb7bc74f41ffe9f988fd01d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 20 Jul 2020 23:01:30 +0200 Subject: [PATCH 149/168] docstring typos --- pytorch_lightning/core/hooks.py | 6 +++--- pytorch_lightning/trainer/callback_hook.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index d63698f20dadf..aa4e274298034 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -135,19 +135,19 @@ def on_validation_epoch_start(self) -> None: def on_validation_epoch_end(self) -> None: """ - Called in the training loop at the very end of the epoch. + Called in the validation loop at the very end of the epoch. """ # do something when the epoch ends def on_test_epoch_start(self) -> None: """ - Called in the training loop at the very beginning of the epoch. + Called in the test loop at the very beginning of the epoch. """ # do something when the epoch starts def on_test_epoch_end(self) -> None: """ - Called in the training loop at the very end of the epoch. + Called in the test loop at the very end of the epoch. 
""" # do something when the epoch ends diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 616d500fc27b9..89b5e712c9190 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -57,7 +57,7 @@ def on_train_epoch_start(self): callback.on_train_epoch_start(self, self.get_model()) def on_train_epoch_end(self): - """Called when the epoch begins.""" + """Called when the epoch ends.""" for callback in self.callbacks: callback.on_train_epoch_end(self, self.get_model()) @@ -67,7 +67,7 @@ def on_validation_epoch_start(self): callback.on_validation_epoch_start(self, self.get_model()) def on_validation_epoch_end(self): - """Called when the epoch begins.""" + """Called when the epoch ends.""" for callback in self.callbacks: callback.on_validation_epoch_end(self, self.get_model()) @@ -77,7 +77,7 @@ def on_test_epoch_start(self): callback.on_test_epoch_start(self, self.get_model()) def on_test_epoch_end(self): - """Called when the epoch begins.""" + """Called when the epoch ends.""" for callback in self.callbacks: callback.on_test_epoch_end(self, self.get_model()) From fe91a2b947ab171bd211e629edd7e05a9efe8066 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 17:30:56 -0400 Subject: [PATCH 150/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 434accf05aad9..0978dc78d3f29 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -487,7 +487,7 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): model.validation_step = None model.test_step = None model.training_step = model.training_step_full_loop_result_obj_dp - model.training_step_end = model.training_step_end_full_loop_result_obj_dp + # model.training_step_end = model.training_step_end_full_loop_result_obj_dp model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp model.val_dataloader = None model.test_dataloader = None From 7dfda42c4eb1beab48c88315183b5fb050b49b08 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 17:32:07 -0400 Subject: [PATCH 151/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- tests/trainer/test_trainer_steps_result_return.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 5a30c7f2874b2..3ed06a0427444 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -46,7 +46,7 @@ def training_step_full_loop_result_obj_dp(self, batch, batch_idx): """ x, y = batch x = x.view(x.size(0), -1) - y_hat = self.forward(x) + y_hat = self(x) loss_val = y_hat.sum() result = TrainResult(minimize=loss_val) result.log('train_step_metric', loss_val + 1) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 0978dc78d3f29..434accf05aad9 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -487,7 +487,7 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): model.validation_step = None model.test_step = None model.training_step = model.training_step_full_loop_result_obj_dp - # 
model.training_step_end = model.training_step_end_full_loop_result_obj_dp + model.training_step_end = model.training_step_end_full_loop_result_obj_dp model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp model.val_dataloader = None model.test_dataloader = None From 4f48912e25fa50f729367612ffbc7a2f48d41032 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 17:33:20 -0400 Subject: [PATCH 152/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 3ed06a0427444..df575a80a9419 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -45,6 +45,7 @@ def training_step_full_loop_result_obj_dp(self, batch, batch_idx): Full loop flow train step (result obj + dp) """ x, y = batch + print(x.device, self.device) x = x.view(x.size(0), -1) y_hat = self(x) loss_val = y_hat.sum() From de5cbb91162c44bcd4ecba35d1c6e0b974e1dede Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 17:34:24 -0400 Subject: [PATCH 153/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index df575a80a9419..71e5b9603c25f 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -45,7 +45,9 @@ def training_step_full_loop_result_obj_dp(self, batch, batch_idx): Full loop flow train step (result obj + dp) """ x, y = batch + print('-' * 100) print(x.device, self.device) + print('-' * 100) x = x.view(x.size(0), -1) y_hat = self(x) loss_val = y_hat.sum() From 6ccf0cc08e22f685a13e1333fabbab01e3e03833 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:13:46 -0400 Subject: [PATCH 154/168] finished tests for structured results on train epoch --- .../test_trainer_steps_result_return.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 434accf05aad9..bbe677786614d 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -483,17 +483,9 @@ def test_use_callbacks_with_train_loop_only(tmpdir): def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' - model = EvalModelTemplate() - model.validation_step = None - model.test_step = None - model.training_step = model.training_step_full_loop_result_obj_dp - model.training_step_end = model.training_step_end_full_loop_result_obj_dp - model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp - model.val_dataloader = None - model.test_dataloader = None - batches = 3 epochs = 3 + trainer = Trainer( default_root_dir=tmpdir, distributed_backend='dp', @@ -504,6 +496,14 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): limit_train_batches=batches, weights_summary=None, ) + model = EvalModelTemplate() + model.validation_step = None + model.test_step = None + model.training_step = model.training_step_full_loop_result_obj_dp + model.training_step_end = model.training_step_end_full_loop_result_obj_dp + model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp + model.val_dataloader = None + model.test_dataloader = None trainer.fit(model) From e671a792744eb036127ad3f652c0b536697e5d4e Mon Sep 17 
00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:17:18 -0400 Subject: [PATCH 155/168] finished tests for structured results on train epoch --- .../test_trainer_steps_result_return.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index bbe677786614d..9acca9ce820b2 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -479,6 +479,31 @@ def test_use_callbacks_with_train_loop_only(tmpdir): assert ckpt_val['monitor'] == 'checkpoint_on' +def test_xxx(tmpdir): + import tests.base.develop_pipelines as tpipes + from pytorch_lightning.core import memory + import tests.base.develop_utils as tutils + + tutils.set_random_master_port() + + trainer_options = dict( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=10, + limit_val_batches=10, + gpus=[0, 1], + distributed_backend='dp', + progress_bar_refresh_rate=0 + ) + + model = EvalModelTemplate() + + tpipes.run_model_test(trainer_options, model) + + # test memory helper functions + memory.get_memory_profile('min_max') + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' From f3ee6c23e9d289e0d4509c06ee20290ad36ad7ce Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:18:20 -0400 Subject: [PATCH 156/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 9acca9ce820b2..72f7cc01ebd34 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -508,7 +508,7 @@ def test_xxx(tmpdir): def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' - batches = 3 + batches = 10 epochs = 3 trainer = Trainer( From be99f0a3657954a8065b66b987d474e6fe92840b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:19:24 -0400 Subject: [PATCH 157/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 72f7cc01ebd34..df473e657860a 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -484,7 +484,7 @@ def test_xxx(tmpdir): from pytorch_lightning.core import memory import tests.base.develop_utils as tutils - tutils.set_random_master_port() + # tutils.set_random_master_port() trainer_options = dict( default_root_dir=tmpdir, @@ -497,11 +497,13 @@ def test_xxx(tmpdir): ) model = EvalModelTemplate() + trainer = Trainer(**trainer_options) + trainer.fit(model) - tpipes.run_model_test(trainer_options, model) + # tpipes.run_model_test(trainer_options, model) # test memory helper functions - memory.get_memory_profile('min_max') + # memory.get_memory_profile('min_max') @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From d767547bef775a212f9b6b488a16643f9ff68106 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:20:04 -0400 
Subject: [PATCH 158/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index df473e657860a..b27caba8bd2b6 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -515,13 +515,12 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): trainer = Trainer( default_root_dir=tmpdir, - distributed_backend='dp', + max_epochs=1, + limit_train_batches=10, + limit_val_batches=10, gpus=[0, 1], - max_epochs=epochs, - early_stop_callback=True, - row_log_interval=2, - limit_train_batches=batches, - weights_summary=None, + distributed_backend='dp', + progress_bar_refresh_rate=0 ) model = EvalModelTemplate() model.validation_step = None From de16f8abcdf2e8c3cd5ec059b1f671374647fc1c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:21:07 -0400 Subject: [PATCH 159/168] finished tests for structured results on train epoch --- .../test_trainer_steps_result_return.py | 41 ++++--------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index b27caba8bd2b6..637171262d600 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -478,34 +478,6 @@ def test_use_callbacks_with_train_loop_only(tmpdir): assert ckpt_val['epoch'] == expected_epoch assert ckpt_val['monitor'] == 'checkpoint_on' - -def test_xxx(tmpdir): - import tests.base.develop_pipelines as tpipes - from pytorch_lightning.core import memory - import tests.base.develop_utils as tutils - - # tutils.set_random_master_port() - - trainer_options = dict( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=10, - limit_val_batches=10, - gpus=[0, 1], - distributed_backend='dp', - progress_bar_refresh_rate=0 - ) - - model = EvalModelTemplate() - trainer = Trainer(**trainer_options) - trainer.fit(model) - - # tpipes.run_model_test(trainer_options, model) - - # test memory helper functions - # memory.get_memory_profile('min_max') - - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' @@ -515,12 +487,13 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=10, - limit_val_batches=10, - gpus=[0, 1], distributed_backend='dp', - progress_bar_refresh_rate=0 + gpus=[0, 1], + max_epochs=epochs, + early_stop_callback=True, + row_log_interval=2, + limit_train_batches=batches, + weights_summary=None, ) model = EvalModelTemplate() model.validation_step = None @@ -531,6 +504,8 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): model.val_dataloader = None model.test_dataloader = None + model = EvalModelTemplate() + trainer.fit(model) # make sure the loop was good From 072cb09a9d0b5b24af14abb710df7e8b82668cc5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:22:18 -0400 Subject: [PATCH 160/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py 
b/tests/trainer/test_trainer_steps_result_return.py index 637171262d600..2389f33013652 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -496,15 +496,15 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): weights_summary=None, ) model = EvalModelTemplate() - model.validation_step = None - model.test_step = None + # model.validation_step = None + # model.test_step = None model.training_step = model.training_step_full_loop_result_obj_dp model.training_step_end = model.training_step_end_full_loop_result_obj_dp model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp - model.val_dataloader = None - model.test_dataloader = None + # model.val_dataloader = None + # model.test_dataloader = None - model = EvalModelTemplate() + # model = EvalModelTemplate() trainer.fit(model) From ea2676157c9efd4b88127f85a1c8fcc1b25c88b9 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:23:18 -0400 Subject: [PATCH 161/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 2389f33013652..fd989a23a30f6 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -499,8 +499,8 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): # model.validation_step = None # model.test_step = None model.training_step = model.training_step_full_loop_result_obj_dp - model.training_step_end = model.training_step_end_full_loop_result_obj_dp - model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp + # model.training_step_end = model.training_step_end_full_loop_result_obj_dp + # model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp # model.val_dataloader = None # model.test_dataloader = None From f74e3b0bae9255de586b47249c25590c04b5d2c0 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:24:23 -0400 Subject: [PATCH 162/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- tests/trainer/test_trainer_steps_result_return.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 71e5b9603c25f..1f921bfabe241 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -40,7 +40,7 @@ def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None): output /= 0 return output - def training_step_full_loop_result_obj_dp(self, batch, batch_idx): + def training_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx=None): """ Full loop flow train step (result obj + dp) """ diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index fd989a23a30f6..637171262d600 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -496,15 +496,15 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): weights_summary=None, ) model = EvalModelTemplate() - # model.validation_step = None - # model.test_step = None + model.validation_step = None + model.test_step = None model.training_step = model.training_step_full_loop_result_obj_dp - # model.training_step_end = 
model.training_step_end_full_loop_result_obj_dp - # model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp - # model.val_dataloader = None - # model.test_dataloader = None + model.training_step_end = model.training_step_end_full_loop_result_obj_dp + model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp + model.val_dataloader = None + model.test_dataloader = None - # model = EvalModelTemplate() + model = EvalModelTemplate() trainer.fit(model) From cab63d457e63fed80e79f1d4c400248e36fe0c9e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:25:39 -0400 Subject: [PATCH 163/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 3 --- .../test_trainer_steps_result_return.py | 25 ++++++++----------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 1f921bfabe241..828bdb3c5fa89 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -45,9 +45,6 @@ def training_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx= Full loop flow train step (result obj + dp) """ x, y = batch - print('-' * 100) - print(x.device, self.device) - print('-' * 100) x = x.view(x.size(0), -1) y_hat = self(x) loss_val = y_hat.sum() diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 637171262d600..a2b03e035b25c 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -485,6 +485,15 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): batches = 10 epochs = 3 + model = EvalModelTemplate() + model.validation_step = None + model.test_step = None + model.training_step = model.training_step_full_loop_result_obj_dp + model.training_step_end = model.training_step_end_full_loop_result_obj_dp + model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp + model.val_dataloader = None + model.test_dataloader = None + trainer = Trainer( default_root_dir=tmpdir, distributed_backend='dp', @@ -495,25 +504,11 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): limit_train_batches=batches, weights_summary=None, ) - model = EvalModelTemplate() - model.validation_step = None - model.test_step = None - model.training_step = model.training_step_full_loop_result_obj_dp - model.training_step_end = model.training_step_end_full_loop_result_obj_dp - model.training_epoch_end = model.training_epoch_end_full_loop_result_obj_dp - model.val_dataloader = None - model.test_dataloader = None - - model = EvalModelTemplate() trainer.fit(model) - # make sure the loop was good - assert model.training_step_called - assert model.training_step_end_called - assert model.training_epoch_end_called - # make sure we saw all the correct keys + import pdb; pdb.set_trace() seen_keys = set() for metric in trainer.dev_debugger.logged_metrics: seen_keys.update(metric.keys()) From db26566c30295a11dad2795a1458e409f1f4cebd Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:26:34 -0400 Subject: [PATCH 164/168] finished tests for structured results on train epoch --- tests/base/model_train_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index 828bdb3c5fa89..6022b864787ab 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -46,7 +46,7 @@ def 
training_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx= """ x, y = batch x = x.view(x.size(0), -1) - y_hat = self(x) + y_hat = self(x.to(self.device)) loss_val = y_hat.sum() result = TrainResult(minimize=loss_val) result.log('train_step_metric', loss_val + 1) From a1010dda35a9bb77b152201e639f2e6fe9624595 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:27:20 -0400 Subject: [PATCH 165/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index a2b03e035b25c..794e8bfc4aa62 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -508,11 +508,11 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): trainer.fit(model) # make sure we saw all the correct keys - import pdb; pdb.set_trace() seen_keys = set() for metric in trainer.dev_debugger.logged_metrics: seen_keys.update(metric.keys()) + import pdb; pdb.set_trace() assert 'train_step_metric' in seen_keys assert 'train_step_end_metric' in seen_keys assert 'train_epoch_end_metric' in seen_keys From 30e17aa1995b0dce3a26c8da1e8b47f6eddcdf84 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:28:32 -0400 Subject: [PATCH 166/168] finished tests for structured results on train epoch --- tests/trainer/test_trainer_steps_result_return.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py index 794e8bfc4aa62..16353bb8b20dc 100644 --- a/tests/trainer/test_trainer_steps_result_return.py +++ b/tests/trainer/test_trainer_steps_result_return.py @@ -277,8 +277,8 @@ def test_training_step_result_log_step_and_epoch(tmpdir): assert isinstance(train_step_out, TrainResult) assert 'minimize' in train_step_out - assert f'step_epoch_log_and_pbar_acc1' in train_step_out - assert f'step_epoch_log_acc2' in train_step_out + assert 'step_epoch_log_and_pbar_acc1' in train_step_out + assert 'step_epoch_log_acc2' in train_step_out # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) @@ -351,8 +351,8 @@ def test_training_step_epoch_end_result(tmpdir): assert isinstance(train_step_out, TrainResult) assert 'minimize' in train_step_out - assert f'step_epoch_log_and_pbar_acc1' in train_step_out - assert f'step_epoch_log_acc2' in train_step_out + assert 'step_epoch_log_and_pbar_acc1' in train_step_out + assert 'step_epoch_log_acc2' in train_step_out # make sure the optimizer closure returns the correct things opt_closure_result = trainer.optimizer_closure(batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) @@ -478,6 +478,7 @@ def test_use_callbacks_with_train_loop_only(tmpdir): assert ckpt_val['epoch'] == expected_epoch assert ckpt_val['monitor'] == 'checkpoint_on' + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_full_train_loop_with_results_obj_dp(tmpdir): os.environ['PL_DEV_DEBUG'] = '1' @@ -512,7 +513,6 @@ def test_full_train_loop_with_results_obj_dp(tmpdir): for metric in trainer.dev_debugger.logged_metrics: seen_keys.update(metric.keys()) - import pdb; pdb.set_trace() assert 'train_step_metric' in seen_keys assert 
'train_step_end_metric' in seen_keys assert 'train_epoch_end_metric' in seen_keys From a7f05440a8488a212139db4d10b42274fe014f28 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:42:27 -0400 Subject: [PATCH 167/168] Update pytorch_lightning/core/step_result.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/core/step_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 73aeb7bb39fdd..1dc88db15ccb5 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -23,7 +23,7 @@ def __init__( if hiddens is not None: self.hiddens = hiddens if minimize is not None: - err = 'Minimize can only be used in training_end, training_step_end, training_epoch_end' + err = 'Minimize can only be used in training_step, training_step_end, training_epoch_end' self._assert_grad_tensor_metric('minimize', minimize, err) self.minimize = minimize From 704d2019375b42588f105eea8ea32948cfeddc37 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Mon, 20 Jul 2020 18:42:36 -0400 Subject: [PATCH 168/168] Update pytorch_lightning/overrides/data_parallel.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/overrides/data_parallel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch_lightning/overrides/data_parallel.py b/pytorch_lightning/overrides/data_parallel.py index 229d9133b1efd..c9c793cc89a2f 100644 --- a/pytorch_lightning/overrides/data_parallel.py +++ b/pytorch_lightning/overrides/data_parallel.py @@ -188,8 +188,6 @@ def _worker(i, module, input, kwargs, device=None): if not isinstance(input, (list, tuple)): input = (input,) - if hasattr(module, '_device'): - module._device = device module = module.to(device) # ---------------