Replace automatic nan check with optional flag (#1475)
* Replace automatic nan check with optional flag

* Update CHANGELOG.md
ethanwharris committed Apr 13, 2020
1 parent 3f1e4b9 commit 8544b33
Showing 4 changed files with 23 additions and 5 deletions.
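
In short, the per-batch NaN check is now opt-in rather than automatic. A minimal usage sketch of the new flag (mirroring the docstring example added in this commit; the default stays `False`):

    from pytorch_lightning import Trainer

    # Default after this commit: no NaN check is run during training.
    trainer = Trainer(terminate_on_nan=False)

    # Opt in: after every training batch, check that the loss and the model
    # parameters are finite, and raise a ValueError otherwise.
    trainer = Trainer(terminate_on_nan=True)
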
6 changes: 3 additions & 3 deletions CHANGELOG.md
@@ -9,13 +9,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added

- Added `auto_select_gpus` flag to trainer that enables automatic selection of available GPUs on exclusive mode systems.
-- Added learining rate finder ([#1347](https://github.com/PyTorchLightning/pytorch-lightning/pull/1347))
+- Added learning rate finder ([#1347](https://github.com/PyTorchLightning/pytorch-lightning/pull/1347))

--
+- Added `terminate_on_nan` flag to trainer that performs a NaN check with each training iteration when set to `True`. ([#1475](https://github.com/PyTorchLightning/pytorch-lightning/pull/1475))

### Changed

--
+- Changed the default behaviour to no longer include a NaN check with each training iteration. ([#1475](https://github.com/PyTorchLightning/pytorch-lightning/pull/1475))

-

5 changes: 5 additions & 0 deletions pytorch_lightning/trainer/trainer.py
@@ -134,6 +134,7 @@ def __init__(
use_amp=None, # backward compatible, todo: remove in v0.9.0
show_progress_bar=None, # backward compatible, todo: remove in v0.9.0
nb_sanity_val_steps=None, # backward compatible, todo: remove in v0.8.0
+terminate_on_nan: bool = False,
**kwargs
):
r"""
@@ -281,6 +282,9 @@ def __init__(
To use a different key, set a string instead of True with the key name.
benchmark: If true enables cudnn.benchmark.
+terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the
+    end of each training batch, if any of the parameters or the loss are NaN or +/-inf.
"""

# Init callbacks
@@ -357,6 +361,7 @@ def __init__(

self.truncated_bptt_steps = truncated_bptt_steps
self.resume_from_checkpoint = resume_from_checkpoint
+self.terminate_on_nan = terminate_on_nan
self.shown_warnings = set()

self.fast_dev_run = fast_dev_run
15 changes: 13 additions & 2 deletions pytorch_lightning/trainer/training_loop.py
@@ -121,7 +121,8 @@ def training_step(self, batch, batch_idx):
NaN detection and intervention
------------------------------
-In every forward pass in training, Lightning will check that
+When the `terminate_on_nan` flag is enabled, after every forward pass during training, Lightning will
+check that
1. the loss you return in `training_step` is finite (not NaN and not +/-inf)
2. the model parameters have finite values.
@@ -130,6 +131,14 @@ def training_step(self, batch, batch_idx):
values are detected. If this happens, you should investigate numerically unstable operations
in your model.
+.. code-block:: python
+
+    # DEFAULT (won't perform the NaN check)
+    trainer = Trainer(terminate_on_nan=False)
+
+    # (NaN check each batch and terminate on NaN or infinite values)
+    trainer = Trainer(terminate_on_nan=True)
"""

import copy
@@ -216,6 +225,7 @@ class TrainerTrainLoopMixin(ABC):
min_steps: int
total_batch_idx: int
checkpoint_callback: ...
+terminate_on_nan: bool

# Callback system
callbacks: List[Callback]
@@ -604,7 +614,8 @@ def optimizer_closure():
loss, batch_output = optimizer_closure()

# check if loss or model weights are nan
-self.detect_nan_tensors(loss)
+if self.terminate_on_nan:
+    self.detect_nan_tensors(loss)

# track total loss for logging (avoid mem leaks)
self.batch_loss_value.append(loss)
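
For reference, a minimal sketch of the kind of check `detect_nan_tensors` performs when `terminate_on_nan=True` (an illustrative approximation based on the error messages exercised in the tests below, not the library's exact implementation):

    import torch

    def detect_nan_tensors_sketch(loss, model):
        # Stop training if the loss returned by `training_step` is NaN or +/-inf.
        if not torch.isfinite(loss).all():
            raise ValueError('The loss returned in `training_step` is nan or inf.')
        # Stop training if any model parameter contains NaN or +/-inf values.
        for name, param in model.named_parameters():
            if not torch.isfinite(param).all():
                raise ValueError(f'Detected nan and/or inf values in `{name}`.')
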
2 changes: 2 additions & 0 deletions tests/trainer/test_trainer.py
@@ -593,6 +593,7 @@ def training_step(self, batch, batch_idx):
trainer = Trainer(
default_root_dir=tmpdir,
max_steps=(test_step + 1),
+terminate_on_nan=True
)

with pytest.raises(ValueError, match=r'.*The loss returned in `training_step` is nan or inf.*'):
@@ -619,6 +620,7 @@ def on_after_backward(self):
trainer = Trainer(
default_root_dir=tmpdir,
max_steps=(test_step + 1),
+terminate_on_nan=True
)

with pytest.raises(ValueError, match=r'.*Detected nan and/or inf values in `c_d1.bias`.*'):
