Skip to content

Commit

Permalink
Do not override the logged epoch in logged_metrics (#7982)
Browse files Browse the repository at this point in the history
  • Loading branch information
carmocca authored Jun 16, 2021
1 parent 2134216 commit bc2c2db
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 23 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


- Changed `WandbLogger(log_model={True/'all'})` to log models as artifacts ([#6231](https://github.com/PyTorchLightning/pytorch-lightning/pull/6231))


- MLFlowLogger now accepts `run_name` as a constructor argument ([#7622](https://github.com/PyTorchLightning/pytorch-lightning/issues/7622))


Expand Down Expand Up @@ -255,6 +257,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed a bug where `precision=64` with `accelerator='ddp_spawn'` would throw a pickle error ([#6924](https://github.com/PyTorchLightning/pytorch-lightning/pull/6924))


- Do not override the existing `epoch` value in `logged_metrics` when already logged by the user ([#7982](https://github.com/PyTorchLightning/pytorch-lightning/issues/7982))


- Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970))


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
step: Step for which metrics should be logged. Default value is `self.global_step` during training or
the total validation / test log step count during validation and testing.
"""
if self.trainer.logger is None or not metrics:
return

# add gpu memory
if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
mem_map = memory.get_memory_profile(self.log_gpu_memory)
Expand All @@ -99,21 +102,19 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
# turn all tensors to scalars
scalar_metrics = metrics_to_scalars(metrics)

if "step" in scalar_metrics and step is None:
step = scalar_metrics.pop("step")

elif step is None:
# added metrics by Lightning for convenience
scalar_metrics['epoch'] = self.trainer.current_epoch
if step is None:
step = scalar_metrics.pop("step", None)
if step is None:
# added metrics for convenience
scalar_metrics.setdefault("epoch", self.trainer.current_epoch)
step = self.trainer.global_step

# log actual metrics
if self.trainer.logger is not None:
if self.trainer.is_global_zero:
self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
self.trainer.logger.save()
if self.trainer.is_global_zero:
self.trainer.logger.agg_and_log_metrics(scalar_metrics, step=step)
self.trainer.logger.save()

self._logged_metrics.update(scalar_metrics)
self._logged_metrics.update(scalar_metrics)

"""
Evaluation metric updates
Expand Down Expand Up @@ -149,9 +150,7 @@ def update_eval_step_metrics(self) -> None:

# logs user requested information to logger
assert not self._epoch_end_reached
metrics = self.metrics[MetricSource.LOG]
if metrics:
self.log_metrics(metrics, step=self._eval_log_step)
self.log_metrics(self.metrics[MetricSource.LOG], step=self._eval_log_step)

# increment the step even if nothing was logged
self._increment_eval_log_step()
Expand Down Expand Up @@ -179,9 +178,7 @@ def update_eval_epoch_metrics(self) -> _EVALUATE_OUTPUT:

if not self.trainer.sanity_checking:
# log all the metrics as a single dict
log_metrics = metrics[MetricSource.LOG]
if log_metrics:
self.log_metrics(log_metrics)
self.log_metrics(metrics[MetricSource.LOG])

self._prepare_eval_loop_results(metrics[MetricSource.CALLBACK])

Expand Down Expand Up @@ -219,16 +216,13 @@ def update_train_step_metrics(self) -> None:

# when metrics should be logged
assert not self._epoch_end_reached
metrics = self.metrics[MetricSource.LOG]
if self.should_update_logs or self.trainer.fast_dev_run is True and metrics:
self.log_metrics(metrics)
if self.should_update_logs or self.trainer.fast_dev_run:
self.log_metrics(self.metrics[MetricSource.LOG])

def update_train_epoch_metrics(self) -> None:
# add the metrics to the loggers
assert self._epoch_end_reached
metrics = self.metrics[MetricSource.LOG]
if metrics:
self.log_metrics(metrics)
self.log_metrics(self.metrics[MetricSource.LOG])

# reset result collection for next epoch
self.trainer._results.reset(metrics=True)
Expand Down
16 changes: 16 additions & 0 deletions tests/trainer/logging_/test_logger_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,3 +492,19 @@ def test_result_collection_on_tensor_with_mean_reduction():
'loss_on_step_on_epoch_prog_bar_logger': mean,
'loss_on_step_on_epoch_prog_bar_logger_epoch': mean
}


def test_logged_metrics_has_logged_epoch_value(tmpdir):
    """A user-logged ``epoch`` metric must not be clobbered by the trainer's own epoch value."""

    class EpochLoggingModel(BoringModel):

        def training_step(self, batch, batch_idx):
            # log a sentinel value under the reserved ``epoch`` key
            self.log('epoch', -batch_idx, logger=True)
            return super().training_step(batch, batch_idx)

    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2)
    trainer.fit(EpochLoggingModel())

    # the value logged on the last step (batch_idx == 1) should survive untouched
    assert trainer.logged_metrics == {'epoch': -1}

0 comments on commit bc2c2db

Please sign in to comment.