Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[blocked by #1043] enabled early stopping/checkpoint even without val step #1041

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
fa7410e
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
d4d9fe7
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
afd79b9
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
3aaaece
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
55e6444
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
11be425
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
726de59
Merge branch 'callback_on_train' of https://github.com/PyTorchLightni…
williamFalcon Mar 4, 2020
7758b16
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
11a1679
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
756472b
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
d2b9bb5
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
c60a392
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
f36993d
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
da0a21d
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
3279fa5
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
10b683b
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
64e528b
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
481d457
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
a43959e
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
a4e724e
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
cb182a2
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
49c1f22
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
b91fc01
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
76c7872
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
1703950
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
e701594
Merge branch 'callback_on_train' of https://github.com/PyTorchLightni…
williamFalcon Mar 4, 2020
d68cf70
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
1f9c824
enabled early stopping/checkpooiunt even without val step
williamFalcon Mar 4, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Support for user defined callbacks ([#889](https://github.com/PyTorchLightning/pytorch-lightning/pull/889) and [#950](https://github.com/PyTorchLightning/pytorch-lightning/pull/950))
- Added support for multiple loggers to be passed to `Trainer` as an iterable (e.g. list, tuple, etc.) ([#903](https://github.com/PyTorchLightning/pytorch-lightning/pull/903))
- Added support for logging hparams as dict ([#1029](https://github.com/PyTorchLightning/pytorch-lightning/pull/1029))
- Checkpoint and early stopping now work without val step ([#1041](https://github.com/PyTorchLightning/pytorch-lightning/pull/1041))

### Changed

Expand Down
18 changes: 13 additions & 5 deletions pytorch_lightning/callbacks/model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,15 @@ class ModelCheckpoint(Callback):

Example::

# no path
ModelCheckpoint()
# saves like /my/path/epoch_0.ckpt

# save epoch and val_loss in name
            ModelCheckpoint(filepath='{epoch:02d}-{val_loss:.2f}.ckpt')

# saves file like: /my/path/here/sample-mnist_epoch=02_val_loss=0.32.ckpt


            # if model already exists, the file will be: /my/path/here/sample-mnist-v0_epoch=02_val_loss=0.32.ckpt


Expand Down Expand Up @@ -146,10 +151,13 @@ def check_monitor_top_k(self, current: float) -> bool:
return self.monitor_op(current, self.best_k_models[self.kth_best_model])

def _get_available_filepath(self, current: float, epoch: int) -> str:
current_str = f'{current:.2f}' if current else 'NaN'
fname = f'{self.prefix}_epoch={epoch}_{self.monitor}={current_str}'
filepath = os.path.join(self.dirpath, fname + self.EXTENSION)
assert not os.path.isfile(filepath)
try:
current_str = f'{current:.2f}' if current else 'NaN'
fname = f'{self.prefix}_epoch={epoch}_{self.monitor}={current_str}'
filepath = os.path.join(self.dirpath, fname + self.EXTENSION)
assert not os.path.isfile(filepath)
except Exception as e:
import pdb; pdb.set_trace()
return filepath

def on_validation_end(self, trainer, pl_module) -> None:
Expand Down
7 changes: 6 additions & 1 deletion pytorch_lightning/trainer/callback_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,14 @@ def configure_checkpoint_callback(self):
else:
ckpt_path = os.path.join(self.default_save_path, "checkpoints")

# when no val step is defined, use 'loss' otherwise 'val_loss'
train_step_only = not self.is_overriden('validation_step')
monitor_key = 'loss' if train_step_only else 'val_loss'

self.ckpt_path = ckpt_path
self.checkpoint_callback = ModelCheckpoint(
dirpath=ckpt_path
dirpath=ckpt_path,
monitor=monitor_key
)
elif self.checkpoint_callback is False:
self.checkpoint_callback = None
Expand Down
6 changes: 0 additions & 6 deletions pytorch_lightning/trainer/evaluation_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ class TrainerEvaluationLoopMixin(ABC):
process_output: ...
training_tqdm_dict: ...
proc_rank: int
checkpoint_callback: ...
current_epoch: int
callback_metrics: ...
test_dataloaders: DataLoader
Expand Down Expand Up @@ -377,11 +376,6 @@ def run_evaluation(self, test_mode: bool = False):
# Validation/Test end callbacks
if test_mode:
self.on_test_end()
else:
# model checkpointing
if self.checkpoint_callback is not None:
self.checkpoint_callback.on_validation_end(self, self.get_model())
self.on_validation_end()

def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: bool = False):
# make dataloader_idx arg in validation_step optional
Expand Down
6 changes: 3 additions & 3 deletions pytorch_lightning/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1132,9 +1132,6 @@ def run_pretrain_routine(self, model: LightningModule):
# wait for all processes to catch up
torch_xla.core.xla_model.rendezvous("pl.Trainer.run_pretrain_routine")

# set up checkpoint callback
self.configure_checkpoint_callback()

# register auto-resubmit when on SLURM
self.register_slurm_signal_handlers()

Expand All @@ -1151,6 +1148,9 @@ def run_pretrain_routine(self, model: LightningModule):
# if cluster resets state, the model will update with the saved weights
self.model = model

# set up checkpoint callback
self.configure_checkpoint_callback()

# restore training and model before hpc call
self.restore_weights(model)

Expand Down
32 changes: 29 additions & 3 deletions pytorch_lightning/trainer/training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ class TrainerTrainLoopMixin(ABC):
max_steps: int
max_steps: int
total_batch_idx: int
checkpoint_callback: ...
Copy link
Member

@Borda Borda Mar 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this for?

Suggested change
checkpoint_callback: ...


# Callback system
callbacks: List[Callback]
Expand All @@ -212,6 +213,7 @@ class TrainerTrainLoopMixin(ABC):
on_batch_end: Callable
on_epoch_start: Callable
on_epoch_end: Callable
on_validation_end: Callable

@property
def max_nb_epochs(self):
Expand Down Expand Up @@ -454,9 +456,6 @@ def run_training_epoch(self):
if self.fast_dev_run or should_check_val:
self.run_evaluation(test_mode=self.testing)

if self.enable_early_stop:
self.early_stop_callback.check_metrics(self.callback_metrics)

# when logs should be saved
should_save_log = (batch_idx + 1) % self.log_save_interval == 0 or early_stop_epoch
if should_save_log or self.fast_dev_run:
Expand All @@ -469,11 +468,33 @@ def run_training_epoch(self):
# logs user requested information to logger
self.log_metrics(batch_step_metrics, grad_norm_dic)

# ---------------
# CHECKPOINTING, EARLY STOPPING
# ---------------
# save checkpoint even when no test or val step are defined
train_step_only = not self.is_overriden('validation_step')
if self.fast_dev_run or should_check_val or train_step_only:
self.call_checkpoint_callback()

if self.enable_early_stop:
self.early_stop_callback.check_metrics(self.callback_metrics)

# progress global step according to grads progress
if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
self.global_step += 1
self.total_batch_idx += 1

# ---------------
# CHECKPOINTING, EARLY STOPPING
# ---------------
# save checkpoint even when no test or val step are defined
train_step_only = not self.is_overriden('validation_step')
if self.fast_dev_run or should_check_val or train_step_only:
self.call_checkpoint_callback()

if self.enable_early_stop:
self.early_stop_callback.check_metrics(self.callback_metrics)

# max steps reached, end training
if self.max_steps is not None and self.max_steps == self.global_step:
break
Expand Down Expand Up @@ -705,3 +726,8 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens):
output = self.process_output(output, train=True)

return output

def call_checkpoint_callback(self):
    """Fire the checkpoint callback's validation-end hook from the training loop.

    Lets checkpointing run even when no validation step is defined
    (this hook previously fired only from ``run_evaluation``).
    """
    # Skip entirely when checkpointing is disabled (checkpoint_callback is None).
    if self.checkpoint_callback is not None:
        self.checkpoint_callback.on_validation_end(self, self.get_model())
    # NOTE(review): indentation was stripped in this extract — presumably the
    # user-callback dispatch below runs unconditionally, outside the `if`;
    # confirm against the upstream file.
    self.on_validation_end()
46 changes: 23 additions & 23 deletions tests/trainer/test_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,27 @@
from pytorch_lightning.trainer.logging import TrainerLoggingMixin
from pytorch_lightning.utilities.debugging import MisconfigurationException

def test_hparams_save_load(tmpdir):
    """Verify hparams given as a dict survive a fit + checkpoint-load round trip."""
    model = DictHparamsModel({'in_features': 28 * 28, 'out_features': 10})

    # logger file to get meta
    trainer_options = dict(
        default_save_path=tmpdir,
        max_epochs=2,
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)

    assert result == 1

    # try to load the model now
    pretrained_model = tutils.load_model_from_checkpoint(
        trainer.checkpoint_callback.dirpath,
        module_class=DictHparamsModel
    )
    # the original left the loaded model unchecked — assert the load succeeded
    assert pretrained_model is not None


def test_no_val_module(tmpdir):
"""Tests use case where trainer saves the model, and user loads it from tags independently."""
Expand Down Expand Up @@ -126,7 +147,8 @@ def test_gradient_accumulation_scheduling(tmpdir):
assert Trainer(accumulate_grad_batches={1: 2.5, 3: 5})

# test optimizer call freq matches scheduler
def _optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
def _optimizer_step(self, epoch, batch_idx, optimizer,
optimizer_idx, second_order_closure=None):
# only test the first 12 batches in epoch
if batch_idx < 12:
if epoch == 0:
Expand Down Expand Up @@ -620,25 +642,3 @@ def test_default_args(tmpdir):

assert isinstance(trainer, Trainer)
assert trainer.max_epochs == 5


def test_hparams_save_load(tmpdir):
model = DictHparamsModel({'in_features': 28 * 28, 'out_features': 10})

# logger file to get meta
trainer_options = dict(
default_save_path=tmpdir,
max_epochs=2,
)

# fit model
trainer = Trainer(**trainer_options)
result = trainer.fit(model)

assert result == 1

# try to load the model now
pretrained_model = tutils.load_model_from_checkpoint(
trainer.checkpoint_callback.dirpath,
module_class=DictHparamsModel
)