From 760475ce9768486c2a5f85ad4d1163dbd9b0b566 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 07:47:32 -0400
Subject: [PATCH 01/13] fix result for dp

---
 tests/base/model_train_steps.py          | 24 ++++++++------
 tests/base/model_valid_steps.py          | 19 ++++++++++++
 .../test_trainer_steps_result_return.py  | 31 +++++++++++++++++++
 3 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py
index b6fc748037af3..19cdd5d7641cb 100644
--- a/tests/base/model_train_steps.py
+++ b/tests/base/model_train_steps.py
@@ -66,17 +66,23 @@ def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None):
         output /= 0
         return output
 
-    def training_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
-        """
-        Full loop flow train step (result obj + dp)
-        """
+    def training_step_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
+        # forward pass
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self(x.to(self.device))
-        loss_val = y_hat.sum()
-        result = TrainResult(minimize=loss_val)
-        result.log('train_step_metric', loss_val + 1)
-        self.training_step_called = True
+        y_hat = self(x)
+
+        # calculate loss
+        loss_val = self.loss(y.to(y_hat.device), y_hat)
+        log_val = loss_val
+
+        # alternate between tensors and scalars for "log" and "progress_bar"
+        if batch_idx % 2 == 0:
+            log_val = log_val.item()
+
+        result = TrainResult(loss_val)
+        result.log('some_val', log_val * log_val, prog_bar=True, logger=False)
+        result.log('train_some_val', log_val * log_val)
         return result
 
     def training_step_end_full_loop_result_obj_dp(self, result):
diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py
index 74694d365545c..9f022d2cb47b4 100644
--- a/tests/base/model_valid_steps.py
+++ b/tests/base/model_valid_steps.py
@@ -52,6 +52,25 @@ def validation_step_result_obj(self, batch, batch_idx, *args, **kwargs):
         })
         return result
 
+    def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs):
+        x, y = batch
+        x = x.view(x.size(0), -1)
+        y_hat = self(x)
+
+        loss_val = self.loss(y.to(y_hat.device), y_hat)
+
+        # acc
+        labels_hat = torch.argmax(y_hat, dim=1)
+        val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
+        val_acc = torch.tensor(val_acc).type_as(x)
+
+        result = EvalResult(checkpoint_on=loss_val, early_stop_on=loss_val)
+        result.log_dict({
+            'val_loss': loss_val,
+            'val_acc': val_acc,
+        })
+        return result
+
     def validation_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs):
         """
         Lightning calls this inside the validation loop
diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py
index 7b9fc080b0765..2f2d108f75281 100644
--- a/tests/trainer/test_trainer_steps_result_return.py
+++ b/tests/trainer/test_trainer_steps_result_return.py
@@ -534,6 +534,37 @@ def test_full_train_loop_with_results_obj_dp(tmpdir):
     assert 'epoch_train_epoch_end_metric' in seen_keys
 
 
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_loop_steps_only_dp(tmpdir):
+    os.environ['PL_DEV_DEBUG'] = '1'
+
+    batches = 10
+    epochs = 3
+
+    model = EvalModelTemplate()
+    model.validation_step = None
+    model.test_step = None
+    model.training_step = model.training_step_result_obj_dp
+    model.training_step_end = None
+    model.training_epoch_end = None
+    model.validation_step = model.validation_step_result_obj_dp
+    model.val_dataloader = None
+    model.test_dataloader = None
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        distributed_backend='dp',
+        gpus=[0, 1],
+        max_epochs=epochs,
+        early_stop_callback=True,
+        row_log_interval=2,
+        limit_train_batches=batches,
+        weights_summary=None,
+    )
+
+    trainer.fit(model)
+
+
 def test_result_map(tmpdir):
     result = TrainResult()
     result.log_dict({'x1': torch.tensor(1), 'x2': torch.tensor(2)})

From 3692b9f3502e037aaa29970194683dafb036e245 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:02:35 -0400
Subject: [PATCH 02/13] fix result for dp

---
 pytorch_lightning/core/step_result.py      | 7 +++++++
 pytorch_lightning/trainer/training_loop.py | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
index e47e186c81714..a57da182b7be0 100644
--- a/pytorch_lightning/core/step_result.py
+++ b/pytorch_lightning/core/step_result.py
@@ -357,6 +357,13 @@ def reduce_across_time(cls, time_outputs):
         result['meta'] = meta
         return result
 
+    def dp_reduce(self):
+        for k, value in self.items():
+            if k == 'meta':
+                continue
+
+            self[k] = value.mean(dim=-1)
+
     @property
     def should_reduce_on_epoch_end(self) -> bool:
         return self['meta']['_internal']['_reduce_on_epoch']
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index fed64c2e09aaf..185026e35063e 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -1221,6 +1221,8 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens):
         else:
             output = self.model.training_step(*args)
 
+        is_result_obj = isinstance(output, Result)
+
         # allow any mode to define training_step_end
         # do something will all the dp outputs (like softmax)
         if self.is_overridden('training_step_end'):
@@ -1229,6 +1231,10 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens):
             # TODO: modify when using result obj
             output = model_ref.training_step_end(output)
 
+        elif is_result_obj and (self.use_dp or self.use_ddp2):
+            import pdb; pdb.set_trace()
+            output.dp_reduce()
+
         # allow any mode to define training_end
         # TODO: remove in 1.0.0
         if self.is_overridden('training_end'):

From 2331fd6599fac3240f55c249ab3b140937af987f Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:04:10 -0400
Subject: [PATCH 03/13] fix result for dp

---
 pytorch_lightning/core/step_result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
index a57da182b7be0..27557453dca4b 100644
--- a/pytorch_lightning/core/step_result.py
+++ b/pytorch_lightning/core/step_result.py
@@ -362,7 +362,7 @@ def dp_reduce(self):
             if k == 'meta':
                 continue
 
-            self[k] = value.mean(dim=-1)
+            self[k] = torch.stack(value).mean(dim=-1)
 
     @property
     def should_reduce_on_epoch_end(self) -> bool:
         return self['meta']['_internal']['_reduce_on_epoch']

From 110fa8703e1e1a4d3a8bb7b307777caddeea3256 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:05:00 -0400
Subject: [PATCH 04/13] fix result for dp

---
 pytorch_lightning/core/step_result.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
index 27557453dca4b..3a4574bb83b3d 100644
--- a/pytorch_lightning/core/step_result.py
+++ b/pytorch_lightning/core/step_result.py
@@ -361,8 +361,9 @@ def dp_reduce(self):
         for k, value in self.items():
             if k == 'meta':
                 continue
-
-            self[k] = torch.stack(value).mean(dim=-1)
+            if isinstance(value, list):
+                value = torch.tensor(value)
+            self[k] = value.mean(dim=-1)
 
     @property
     def should_reduce_on_epoch_end(self) -> bool:
         return self['meta']['_internal']['_reduce_on_epoch']

From 3e570f535ee49a0e8e7730b600d49ba28e90ab23 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:05:41 -0400
Subject: [PATCH 05/13] fix result for dp

---
 pytorch_lightning/trainer/training_loop.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 185026e35063e..8577f98f9d65f 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -1232,7 +1232,6 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens):
             output = model_ref.training_step_end(output)
 
         elif is_result_obj and (self.use_dp or self.use_ddp2):
-            import pdb; pdb.set_trace()
             output.dp_reduce()
 
         # allow any mode to define training_end

From eb802d8861e0f945b38e57a1c804edb50dc0a813 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:08:15 -0400
Subject: [PATCH 06/13] fix result for dp

---
 tests/base/model_train_steps.py                    | 3 +++
 tests/base/model_valid_steps.py                    | 2 ++
 tests/trainer/test_trainer_steps_result_return.py  | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py
index 19cdd5d7641cb..b2ccb4824bd19 100644
--- a/tests/base/model_train_steps.py
+++ b/tests/base/model_train_steps.py
@@ -83,6 +83,9 @@ def training_step_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
         result = TrainResult(loss_val)
         result.log('some_val', log_val * log_val, prog_bar=True, logger=False)
         result.log('train_some_val', log_val * log_val)
+
+        self.training_step_called = True
+
         return result
 
     def training_step_end_full_loop_result_obj_dp(self, result):
diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py
index 9f022d2cb47b4..841ec3a2e4eda 100644
--- a/tests/base/model_valid_steps.py
+++ b/tests/base/model_valid_steps.py
@@ -69,6 +69,8 @@ def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs):
             'val_loss': loss_val,
             'val_acc': val_acc,
         })
+
+        self.validation_step_called = True
         return result
 
     def validation_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs):
diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py
index 2f2d108f75281..19f3d3415cd3c 100644
--- a/tests/trainer/test_trainer_steps_result_return.py
+++ b/tests/trainer/test_trainer_steps_result_return.py
@@ -564,6 +564,9 @@ def test_loop_steps_only_dp(tmpdir):
     trainer.fit(model)
 
+    assert model.training_step_called
+    assert model.validation_step_called
+
 
 def test_result_map(tmpdir):
     result = TrainResult()
     result.log_dict({'x1': torch.tensor(1), 'x2': torch.tensor(2)})

From c9fb8f912aa41f14a877234be30f292c46cce713 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:09:26 -0400
Subject: [PATCH 07/13] fix result for dp

---
 tests/trainer/test_trainer_steps_result_return.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py
index 19f3d3415cd3c..dbce4f77c06d5 100644
--- a/tests/trainer/test_trainer_steps_result_return.py
+++ b/tests/trainer/test_trainer_steps_result_return.py
@@ -548,7 +548,6 @@ def test_loop_steps_only_dp(tmpdir):
     model.training_step_end = None
     model.training_epoch_end = None
     model.validation_step = model.validation_step_result_obj_dp
-    model.val_dataloader = None
     model.test_dataloader = None
 
     trainer = Trainer(

From 28cf69e7ccea4cd0ef7036619ad2d211a9c9dfe3 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:10:35 -0400
Subject: [PATCH 08/13] fix result for dp

---
 tests/base/model_valid_steps.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py
index 841ec3a2e4eda..61d95d0417d42 100644
--- a/tests/base/model_valid_steps.py
+++ b/tests/base/model_valid_steps.py
@@ -57,7 +57,8 @@ def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs):
         x = x.view(x.size(0), -1)
         y_hat = self(x)
 
-        loss_val = self.loss(y.to(y_hat.device), y_hat)
+        y = y.to(y_hat.device)
+        loss_val = self.loss(y, y_hat)
 
         # acc
         labels_hat = torch.argmax(y_hat, dim=1)

From 8148c70f149354c3b5d48bba919fe4420e5a19b6 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:12:21 -0400
Subject: [PATCH 09/13] fix result for dp

---
 tests/base/model_valid_steps.py                    | 2 +-
 tests/trainer/test_trainer_steps_result_return.py  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py
index 61d95d0417d42..c71a8228b1a90 100644
--- a/tests/base/model_valid_steps.py
+++ b/tests/base/model_valid_steps.py
@@ -71,7 +71,7 @@ def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs):
             'val_acc': val_acc,
         })
 
-        self.validation_step_called = True
+        # self.validation_step_called = True
         return result
 
     def validation_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs):
diff --git a/tests/trainer/test_trainer_steps_result_return.py b/tests/trainer/test_trainer_steps_result_return.py
index dbce4f77c06d5..2dc63bb40ffb9 100644
--- a/tests/trainer/test_trainer_steps_result_return.py
+++ b/tests/trainer/test_trainer_steps_result_return.py
@@ -548,6 +548,8 @@ def test_loop_steps_only_dp(tmpdir):
     model.training_step_end = None
     model.training_epoch_end = None
     model.validation_step = model.validation_step_result_obj_dp
+    model.validation_step_end = None
+    model.validation_epoch_end = None
     model.test_dataloader = None
 
     trainer = Trainer(

From c1c913103cc49ac27de4bf8fe104711a57f6d9ea Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:17:31 -0400
Subject: [PATCH 10/13] fix result for dp

---
 pytorch_lightning/trainer/evaluation_loop.py | 23 +++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index d7f5f3eb0fd69..4ef1b34b3c3fa 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -343,17 +343,20 @@ def _evaluate(
                     m = 'only EvalResults or dicts are allowed from validation_step'
                     raise MisconfigurationException(m)
 
+                # ------------------
+                # EVAL STEP END
+                # ------------------
                 # on dp / ddp2 might still want to do something with the batch parts
-                if test_mode:
-                    if self.is_overridden('test_step_end'):
-                        model_ref = self.get_model()
-                        with self.profiler.profile('test_step_end'):
-                            output = model_ref.test_step_end(output)
-                else:
-                    if self.is_overridden('validation_step_end'):
-                        model_ref = self.get_model()
-                        with self.profiler.profile('validation_step_end'):
-                            output = model_ref.validation_step_end(output)
+                eval_step_end_hook_name = 'test_step_end' if test_mode else 'validation_step_end'
+                if self.is_overridden(eval_step_end_hook_name):
+                    model_ref = self.get_model()
+                    with self.profiler.profile(eval_step_end_hook_name):
+                        eval_step_end = getattr(model_ref, eval_step_end_hook_name)
+                        output = eval_step_end(output)
+
+                elif is_result_obj and (self.use_dp or self.use_ddp2):
+                    # result auto reduce
+                    output.dp_reduce()
 
                 # callbacks (on __batch_end)
                 if test_mode:

From 197af434efda2c1909664049b61d5875aa861415 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:18:23 -0400
Subject: [PATCH 11/13] fix result for dp

---
 tests/base/model_valid_steps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py
index c71a8228b1a90..61d95d0417d42 100644
--- a/tests/base/model_valid_steps.py
+++ b/tests/base/model_valid_steps.py
@@ -71,7 +71,7 @@ def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs):
             'val_acc': val_acc,
         })
 
-        # self.validation_step_called = True
+        self.validation_step_called = True
         return result
 
     def validation_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs):

From a5775073276665bd317b9d7ffe3903c7e4083a43 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 08:23:16 -0400
Subject: [PATCH 12/13] fix result for dp

---
 tests/base/model_train_steps.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py
index b2ccb4824bd19..0229c1960edb4 100644
--- a/tests/base/model_train_steps.py
+++ b/tests/base/model_train_steps.py
@@ -66,6 +66,19 @@ def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None):
         output /= 0
         return output
 
+    def training_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
+        """
+        Full loop flow train step (result obj + dp)
+        """
+        x, y = batch
+        x = x.view(x.size(0), -1)
+        y_hat = self(x.to(self.device))
+        loss_val = y_hat.sum()
+        result = TrainResult(minimize=loss_val)
+        result.log('train_step_metric', loss_val + 1)
+        self.training_step_called = True
+        return result
+
     def training_step_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
         # forward pass
         x, y = batch
         x = x.view(x.size(0), -1)

From 4e31d594919789aca6288f8ff7565fff13dffc0a Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Mon, 17 Aug 2020 09:16:05 -0400
Subject: [PATCH 13/13] added warning when changing monitor and using results obj

---
 tests/base/model_train_steps.py | 2 +-
 tests/base/model_valid_steps.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py
index 0229c1960edb4..6d7cd365d8c25 100644
--- a/tests/base/model_train_steps.py
+++ b/tests/base/model_train_steps.py
@@ -83,7 +83,7 @@ def training_step_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
         # forward pass
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self(x)
+        y_hat = self(x.to(self.device))
 
         # calculate loss
         loss_val = self.loss(y.to(y_hat.device), y_hat)
diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py
index 61d95d0417d42..94b843e96a02a 100644
--- a/tests/base/model_valid_steps.py
+++ b/tests/base/model_valid_steps.py
@@ -55,7 +55,7 @@ def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs):
         x, y = batch
         x = x.view(x.size(0), -1)
-        y_hat = self(x)
+        y_hat = self(x.to(self.device))
 
         y = y.to(y_hat.device)
         loss_val = self.loss(y, y_hat)
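
Editorial note, not part of any patch above: the sketch below is a minimal illustration of the reduction that the new Result.dp_reduce() method (patches 02-04) applies to a DP/DDP2 step output, assuming a plain dict standing in for the Result object. The helper name dp_reduce_like and the example dict are hypothetical; only the reduction logic is taken from the diffs.

# Illustrative sketch only: mirrors the logic added to Result.dp_reduce() in
# patches 02-04, applied here to an ordinary dict instead of a Result object.
import torch


def dp_reduce_like(result: dict) -> dict:
    for k, value in result.items():
        if k == 'meta':
            # bookkeeping entry is skipped, as in the patched method
            continue
        if isinstance(value, list):
            # per-GPU values may arrive as a plain Python list
            value = torch.tensor(value)
        # average over the last dimension, which indexes the DP replicas
        result[k] = value.mean(dim=-1)
    return result


# two GPUs each reported a loss; after reduction a single scalar remains
example = {'loss': torch.tensor([0.9, 1.1]), 'meta': {'_internal': {}}}
print(dp_reduce_like(example)['loss'])  # 0-dim tensor close to 1.0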