From fd4264114066cda3740365bc652c91a3901f6908 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 11:11:27 -0400 Subject: [PATCH 01/17] adding native amp suppport --- .../trainer/auto_mix_precision.py | 32 ++++++++++++++++++- pytorch_lightning/trainer/trainer.py | 18 ++--------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/auto_mix_precision.py b/pytorch_lightning/trainer/auto_mix_precision.py index 135cf83e288c8..817c96e963492 100644 --- a/pytorch_lightning/trainer/auto_mix_precision.py +++ b/pytorch_lightning/trainer/auto_mix_precision.py @@ -1,6 +1,8 @@ from abc import ABC +import torch from pytorch_lightning import _logger as log +from pytorch_lightning.utilities import rank_zero_warn try: from apex import amp @@ -15,8 +17,36 @@ class TrainerAMPMixin(ABC): # this is just a summary on variables used in this abstract class, # the proper values/initialisation should be done in child class precision: int + use_native_amp: bool - def init_amp(self, use_amp): + def init_amp(self, use_amp, amp_level, precision): + # 16 bit mixed precision training using apex + self.use_native_amp = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") + self.amp_level = amp_level + self.precision = precision + + # TODO: remove in v 0.8.0 + if self.use_native_amp: + rank_zero_warn("`amp_level` has been deprecated since v0.7.4 " + "(native amp does not require it)" + " and this argument will be removed in v0.8.0", DeprecationWarning) + + # Backward compatibility, TODO: remove in v0.9.0 + if use_amp is not None: + rank_zero_warn("`use_amp` has been replaced by `precision` since v0.7.0" + " and this argument will be removed in v0.9.0", DeprecationWarning) + self.precision = 16 if use_amp else 32 + + assert self.precision in (16, 32), 'only 32 or 16 bit precision supported' + + if self.precision == 16 and self.num_tpu_cores is None: + use_amp = True + + if use_amp and self.use_native_amp: + log.info('Using 16bit precision.') + return + + # TODO: remove all below for v0.8.0 if use_amp and not APEX_AVAILABLE: # pragma: no-cover raise ModuleNotFoundError(""" You set `use_amp=True` but do not have apex installed. 
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 20ef14ca3cb50..aed4d48fe342d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -115,7 +115,6 @@ def __init__( print_nan_grads: bool = False, # backward compatible, todo: remove in v0.9.0 weights_summary: Optional[str] = 'full', weights_save_path: Optional[str] = None, - amp_level: str = 'O1', num_sanity_val_steps: int = 5, truncated_bptt_steps: Optional[int] = None, resume_from_checkpoint: Optional[str] = None, @@ -124,6 +123,7 @@ def __init__( reload_dataloaders_every_epoch: bool = False, auto_lr_find: Union[bool, str] = False, replace_sampler_ddp: bool = True, + amp_level: str = 'O1', # backward compatible, todo: remove in v0.8.0 default_save_path=None, # backward compatible, todo: remove in v0.8.0 gradient_clip=None, # backward compatible, todo: remove in v0.8.0 nb_gpu_nodes=None, # backward compatible, todo: remove in v0.8.0 @@ -487,21 +487,7 @@ def __init__( self.determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct) - # 16 bit mixed precision training using apex - self.amp_level = amp_level - self.precision = precision - - # Backward compatibility, TODO: remove in v0.9.0 - if use_amp is not None: - rank_zero_warn("`use_amp` has been replaced by `precision` since v0.7.0" - " and this argument will be removed in v0.9.0", DeprecationWarning) - self.precision = 16 if use_amp else 32 - - assert self.precision in (16, 32), 'only 32 or 16 bit precision supported' - - if self.precision == 16 and self.num_tpu_cores is None: - use_amp = True - self.init_amp(use_amp) + self.init_amp(use_amp, amp_level, precision) # Callback system self.on_init_end() From ea1650a0a98fc3c90d1d4bd603c09300c867929b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 11:31:13 -0400 Subject: [PATCH 02/17] adding native amp suppport --- pytorch_lightning/trainer/auto_mix_precision.py | 17 +++++++---------- pytorch_lightning/trainer/distrib_parts.py | 11 +++++++++-- pytorch_lightning/trainer/trainer.py | 12 +++++++++++- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/trainer/auto_mix_precision.py b/pytorch_lightning/trainer/auto_mix_precision.py index 817c96e963492..8d63ec9f4a71f 100644 --- a/pytorch_lightning/trainer/auto_mix_precision.py +++ b/pytorch_lightning/trainer/auto_mix_precision.py @@ -19,12 +19,7 @@ class TrainerAMPMixin(ABC): precision: int use_native_amp: bool - def init_amp(self, use_amp, amp_level, precision): - # 16 bit mixed precision training using apex - self.use_native_amp = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") - self.amp_level = amp_level - self.precision = precision - + def init_amp(self, use_amp): # TODO: remove in v 0.8.0 if self.use_native_amp: rank_zero_warn("`amp_level` has been deprecated since v0.7.4 " @@ -39,9 +34,6 @@ def init_amp(self, use_amp, amp_level, precision): assert self.precision in (16, 32), 'only 32 or 16 bit precision supported' - if self.precision == 16 and self.num_tpu_cores is None: - use_amp = True - if use_amp and self.use_native_amp: log.info('Using 16bit precision.') return @@ -61,4 +53,9 @@ def init_amp(self, use_amp, amp_level, precision): @property def use_amp(self) -> bool: - return self.precision == 16 and APEX_AVAILABLE + if self.use_native_amp: + return self.precision == 16 + + # TODO: remove in v0.8.0 + else: + return self.precision == 16 and APEX_AVAILABLE diff --git 
a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 7ce61bbfb77e6..47a32eac3b216 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -394,6 +394,7 @@ class TrainerDPMixin(ABC): tpu_local_core_rank: int tpu_global_core_rank: int use_tpu: bool + use_native_amp: bool data_parallel_device_ids: ... logger: Union[LightningLoggerBase, bool] @@ -481,7 +482,8 @@ def single_gpu_train(self, model): # allow for lr schedulers as well self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model) - if self.use_amp: + # TODO: update for 0.8.0 + if self.use_amp and not self.use_native_amp: # An example model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers @@ -528,9 +530,14 @@ def dp_train(self, model): model.cuda(self.root_gpu) + # TODO: remove in v0.8.0 + if self.use_amp and self.use_native_amp: + pass + + # TODO: remove in v0.8.0 # check for this bug (amp + dp + !01 doesn't work) # https://github.com/NVIDIA/apex/issues/227 - if self.use_dp and self.use_amp: + if self.use_dp and self.use_amp and not self.use_native_amp: if self.amp_level == 'O2': raise MisconfigurationException( f'Amp level {self.amp_level} with DataParallel is not supported.' diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index aed4d48fe342d..e8b7d5b14a675 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -487,7 +487,17 @@ def __init__( self.determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct) - self.init_amp(use_amp, amp_level, precision) + # AMP init + # These are the only lines needed after v0.8.0 + self.use_native_amp = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") + if self.use_native_amp and self.precision == 16: + self.scaler = torch.cuda.amp.GradScaler() + self.precision = precision + + # TODO: remove for v0.8.0 + self.amp_level = amp_level + self.precision = precision + self.init_amp(use_amp) # Callback system self.on_init_end() From 807033d6d0ea7101c993e94de3d3b3d1b7eda1b4 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 11:33:46 -0400 Subject: [PATCH 03/17] adding native amp suppport --- pytorch_lightning/trainer/distrib_data_parallel.py | 5 +++-- pytorch_lightning/trainer/distrib_parts.py | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index bfc85ee883f6e..736af5cad928f 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -151,6 +151,7 @@ class TrainerDDPMixin(ABC): amp_level: str use_tpu: bool default_root_dir: str + use_native_amp: bool @property @abstractmethod @@ -350,8 +351,8 @@ def ddp_train(self, process_idx, model): # AMP # run through amp wrapper before going to distributed DP - if self.use_amp: - # An example + # TODO: remove in v0.8.0 + if self.use_amp and not self.use_native_amp: model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level) self.optimizers = optimizers diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 47a32eac3b216..5863fccbb5e5e 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -530,10 +530,6 @@ def 
dp_train(self, model): model.cuda(self.root_gpu) - # TODO: remove in v0.8.0 - if self.use_amp and self.use_native_amp: - pass - # TODO: remove in v0.8.0 # check for this bug (amp + dp + !01 doesn't work) # https://github.com/NVIDIA/apex/issues/227 From d6983281433c13a5c32703d9fafca874df7e16e2 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 11:45:24 -0400 Subject: [PATCH 04/17] adding native amp suppport --- pytorch_lightning/core/hooks.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 67b67ec1e8ee6..00881c42ba149 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -140,9 +140,15 @@ def backward(self, use_amp, loss, optimizer): """ if trainer.precision == 16: - # .backward is not special on 16-bit with TPUs - if not trainer.on_tpu: + if trainer.on_tpu: + return + + if self.use_native_amp: + self.scaler.scale(loss).backward() + + # TODO: remove in v0.8.0 + else: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: From 794df48efd5242c135048dc98720501475235c61 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 11:56:40 -0400 Subject: [PATCH 05/17] autocast --- pytorch_lightning/trainer/evaluation_loop.py | 6 +++++- pytorch_lightning/trainer/training_loop.py | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index a996bd7a60d70..534c156ee7c6e 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -268,7 +268,11 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_ # ----------------- # RUN EVALUATION STEP # ----------------- - output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode) + if self.use_amp and self.use_native_amp: + with torch.cuda.amp.autocast(): + output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode) + else: + output = self.evaluation_forward(model, batch, batch_idx, dataloader_idx, test_mode) # on dp / ddp2 might still want to do something with the batch parts if test_mode: diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 5b3d13c72b5f1..f464f0d4b63e4 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -148,6 +148,7 @@ def training_step(self, batch, batch_idx): import numpy as np from torch.utils.data import DataLoader +import torch from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback @@ -588,8 +589,12 @@ def run_training_batch(self, batch, batch_idx): def optimizer_closure(): # forward pass with self.profiler.profile('model_forward'): - output_dict = self.training_forward( - split_batch, batch_idx, opt_idx, self.hiddens) + if self.use_amp and self.use_native_amp: + with torch.cuda.amp.autocast(): + output_dict = self.training_forward(split_batch, batch_idx, + opt_idx, self.hiddens) + else: + output_dict = self.training_forward(split_batch, batch_idx, opt_idx, self.hiddens) # format and reduce outputs accordingly processed_output = self.process_output(output_dict, train=True) From fb6e4144c4218cf49abf19f59f6a92e84d378025 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 11:59:48 -0400 Subject: [PATCH 06/17] autocast --- 
pytorch_lightning/trainer/training_loop.py | 2 ++ pytorch_lightning/trainer/training_tricks.py | 1 + 2 files changed, 3 insertions(+) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index f464f0d4b63e4..4b9e906d32f29 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -650,6 +650,8 @@ def optimizer_closure(): self.track_grad_norm) # clip gradients + if self.use_amp and self.use_native_amp: + self.scaler.unscale_(optimizer) self.clip_gradients() # calls .step(), .zero_grad() diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py index 3364c9d305455..0d86d53b7bbc4 100644 --- a/pytorch_lightning/trainer/training_tricks.py +++ b/pytorch_lightning/trainer/training_tricks.py @@ -24,6 +24,7 @@ def get_model(self): """Warning: this is just empty shell for code implemented in other class.""" def clip_gradients(self): + # this code is a modification of torch.nn.utils.clip_grad_norm_ # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md if self.gradient_clip_val > 0: From ba02a20ad20a699176363413c6d083bcc9810b36 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 12:21:10 -0400 Subject: [PATCH 07/17] autocast --- pytorch_lightning/core/hooks.py | 3 +++ pytorch_lightning/core/lightning.py | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 00881c42ba149..351463a251eba 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -138,6 +138,8 @@ def backward(self, use_amp, loss, optimizer): else: loss.backward() + .. note:: with PyTorch 1.6+ + precision=16 + multiple optimizers, set .backward(retrain_graph=True) + """ if trainer.precision == 16: # .backward is not special on 16-bit with TPUs @@ -145,6 +147,7 @@ def backward(self, use_amp, loss, optimizer): return if self.use_native_amp: + # don't forget to retain graph on backward with multiple optimizers self.scaler.scale(loss).backward() # TODO: remove in v0.8.0 diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index bde43a6a0f8f6..632fe811f601d 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1159,7 +1159,14 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, elif isinstance(optimizer, torch.optim.LBFGS): optimizer.step(second_order_closure) else: - optimizer.step() + if self.use_amp and self.use_native_amp: + self.trainer.scaler.step(optimizer) + else: + optimizer.step() + + # in native 16-bit we need to update scaler after optimizer step + if self.use_amp and self.use_native_amp: + self.trainer.scaler.update() # model hook self.on_before_zero_grad(optimizer) From ee6299e2940b483a99f161e4dc9bcf5918d94c3c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 12:33:04 -0400 Subject: [PATCH 08/17] autocast --- pytorch_lightning/core/hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 351463a251eba..8264241aab1f4 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -146,9 +146,9 @@ def backward(self, use_amp, loss, optimizer): if trainer.on_tpu: return - if self.use_native_amp: + if self.trainer.use_native_amp: # don't forget to retain graph on backward with multiple optimizers - 
self.scaler.scale(loss).backward() + self.trainer.scaler.scale(loss).backward() # TODO: remove in v0.8.0 else: From 4d0604004ab036d818a99ae4cb3bfcfc7bc7f83c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 12:37:10 -0400 Subject: [PATCH 09/17] autocast --- pytorch_lightning/trainer/distrib_parts.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 5863fccbb5e5e..f6613ce1ac4ba 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -530,6 +530,10 @@ def dp_train(self, model): model.cuda(self.root_gpu) + # hack forward to do autocast for the user + if self.use_amp and self.use_native_amp: + model.forward = torch.cuda.amp.autocast()(model.forward) + # TODO: remove in v0.8.0 # check for this bug (amp + dp + !01 doesn't work) # https://github.com/NVIDIA/apex/issues/227 From 199c96c9b57ede4c8488bc5bb9cd220debad6f23 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 12:42:02 -0400 Subject: [PATCH 10/17] autocast --- pytorch_lightning/trainer/distrib_parts.py | 4 ++++ pytorch_lightning/trainer/trainer.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index f6613ce1ac4ba..7b79922d82a00 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -531,7 +531,9 @@ def dp_train(self, model): model.cuda(self.root_gpu) # hack forward to do autocast for the user + model_autocast_original_forward = model.forward if self.use_amp and self.use_native_amp: + # wrap the user's forward in autocast and give it back at the end model.forward = torch.cuda.amp.autocast()(model.forward) # TODO: remove in v0.8.0 @@ -558,6 +560,8 @@ def dp_train(self, model): self.run_pretrain_routine(model) + model.forward = model_autocast_original_forward + def horovod_train(self, model): # Horovod: initialize library hvd.init() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index e8b7d5b14a675..e92f3c092631e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -489,6 +489,8 @@ def __init__( # AMP init # These are the only lines needed after v0.8.0 + # we wrap the user's forward with autocast and give it back at the end of fit + self.autocast_original_forward = None self.use_native_amp = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") if self.use_native_amp and self.precision == 16: self.scaler = torch.cuda.amp.GradScaler() From 2af9dc9ee1aa9c40038536c408389dc565d6af26 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 19:48:59 -0400 Subject: [PATCH 11/17] removed comments --- pytorch_lightning/core/hooks.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 8264241aab1f4..1a3f05be11c50 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -138,8 +138,6 @@ def backward(self, use_amp, loss, optimizer): else: loss.backward() - .. 
note:: with PyTorch 1.6+ + precision=16 + multiple optimizers, set .backward(retrain_graph=True) - """ if trainer.precision == 16: # .backward is not special on 16-bit with TPUs @@ -147,7 +145,6 @@ def backward(self, use_amp, loss, optimizer): return if self.trainer.use_native_amp: - # don't forget to retain graph on backward with multiple optimizers self.trainer.scaler.scale(loss).backward() # TODO: remove in v0.8.0 From afb6801a3229154b1e3de6b1b70fe33015bff973 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 19:55:24 -0400 Subject: [PATCH 12/17] removed comments --- pytorch_lightning/core/lightning.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 632fe811f601d..ace22fd75cb0e 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1157,6 +1157,12 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, if self.trainer.use_tpu and XLA_AVAILABLE: xm.optimizer_step(optimizer) elif isinstance(optimizer, torch.optim.LBFGS): + + # native amp + lbfgs is a no go right now + if self.use_amp and self.use_native_amp: + m = 'native PyTorch amp and lbfgs are not compatible. To request, please file' \ + 'a Github issue in PyTorch and tag @mcarilli' + raise MisconfigurationException(m) optimizer.step(second_order_closure) else: if self.use_amp and self.use_native_amp: From fa87d1da769ee7160984e0b905cc8b22d8647656 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 20:09:41 -0400 Subject: [PATCH 13/17] added state saving --- pytorch_lightning/trainer/training_io.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 47448132df28a..09a65af0485de 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -281,6 +281,10 @@ def restore(self, checkpoint_path: str, on_gpu: bool): if on_gpu: model.cuda(self.root_gpu) + # restore amp scaling + if self.use_amp and self.use_native_amp and 'native_amp_scaling_state' in checkpoint: + self.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + # load training state (affects trainer only) self.restore_training_state(checkpoint) @@ -316,6 +320,10 @@ def dump_checkpoint(self): checkpoint['state_dict'] = model.state_dict() + # restore native amp scaling + if self.use_amp and self.use_native_amp and 'native_amp_scaling_state' in checkpoint: + checkpoint['native_amp_scaling_state'] = self.scaler.state_dict + if hasattr(model, "hparams"): is_namespace = isinstance(model.hparams, Namespace) checkpoint['hparams'] = vars(model.hparams) if is_namespace else model.hparams @@ -441,6 +449,10 @@ def hpc_load(self, folderpath, on_gpu): # load the state_dict on the model automatically model.load_state_dict(checkpoint['state_dict']) + # restore amp scaling + if self.use_amp and self.use_native_amp and 'native_amp_scaling_state' in checkpoint: + self.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + if self.root_gpu is not None: model.cuda(self.root_gpu) From c60a885bf1a4b16f0559d40f2ae87901fe4c3c37 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 22 Apr 2020 21:12:18 -0400 Subject: [PATCH 14/17] added state saving --- pytorch_lightning/trainer/training_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py index 
09a65af0485de..0e9d00c6c4019 100644 --- a/pytorch_lightning/trainer/training_io.py +++ b/pytorch_lightning/trainer/training_io.py @@ -322,7 +322,7 @@ def dump_checkpoint(self): # restore native amp scaling if self.use_amp and self.use_native_amp and 'native_amp_scaling_state' in checkpoint: - checkpoint['native_amp_scaling_state'] = self.scaler.state_dict + checkpoint['native_amp_scaling_state'] = self.scaler.state_dict() if hasattr(model, "hparams"): is_namespace = isinstance(model.hparams, Namespace) From de22e4f23807ce65629c8201209fa2a2e6122682 Mon Sep 17 00:00:00 2001 From: "J. Borovec" Date: Thu, 23 Apr 2020 19:37:04 +0200 Subject: [PATCH 15/17] try install amp again --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 23f520c0b77e3..db40ca68e26eb 100644 --- a/.drone.yml +++ b/.drone.yml @@ -31,7 +31,7 @@ steps: - pip install pip -U - pip --version - nvidia-smi - # - bash ./tests/install_AMP.sh + - bash ./tests/install_AMP.sh - apt-get update && apt-get install -y cmake - pip install -r requirements.txt --user -q - pip install -r ./tests/requirements-devel.txt --user -q From 879e691cd529be25d552a8ad3ddc907751f2fc95 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Thu, 23 Apr 2020 13:50:31 -0400 Subject: [PATCH 16/17] added state saving --- pytorch_lightning/trainer/auto_mix_precision.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/auto_mix_precision.py b/pytorch_lightning/trainer/auto_mix_precision.py index 8d63ec9f4a71f..2551b8a22dd0f 100644 --- a/pytorch_lightning/trainer/auto_mix_precision.py +++ b/pytorch_lightning/trainer/auto_mix_precision.py @@ -53,9 +53,4 @@ def init_amp(self, use_amp): @property def use_amp(self) -> bool: - if self.use_native_amp: - return self.precision == 16 - - # TODO: remove in v0.8.0 - else: - return self.precision == 16 and APEX_AVAILABLE + return self.precision == 16 From 60b9963aa6ae52d80fbc9a9c9242d0f7123b9f8f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 23 Apr 2020 20:15:30 +0200 Subject: [PATCH 17/17] drop Apex reinstall --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index db40ca68e26eb..407ebd066cf9b 100644 --- a/.drone.yml +++ b/.drone.yml @@ -31,7 +31,7 @@ steps: - pip install pip -U - pip --version - nvidia-smi - - bash ./tests/install_AMP.sh + #- bash ./tests/install_AMP.sh - apt-get update && apt-get install -y cmake - pip install -r requirements.txt --user -q - pip install -r ./tests/requirements-devel.txt --user -q
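
Taken together, patches 01 and 02 detect native AMP by probing torch.cuda.amp, fold the deprecated use_amp flag into precision, and create a torch.cuda.amp.GradScaler on the trainer when native 16-bit is requested. A minimal sketch of that setup logic, assuming PyTorch 1.6+; the helper names below are illustrative, not part of the Lightning API:

import torch

def native_amp_available() -> bool:
    # native AMP ships with torch.cuda.amp.autocast (PyTorch 1.6+)
    return hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast")

def resolve_precision(precision: int, use_amp=None) -> int:
    # `use_amp` is the deprecated flag; when given, it overrides `precision`
    if use_amp is not None:
        precision = 16 if use_amp else 32
    assert precision in (16, 32), 'only 32 or 16 bit precision supported'
    return precision

def make_scaler(precision: int):
    # the GradScaler is only needed on the native 16-bit path
    if native_amp_available() and precision == 16:
        return torch.cuda.amp.GradScaler()
    return None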
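
Patches 04, 05, 06 and 07 wire the GradScaler through the training step: the forward runs under autocast, the loss is scaled before backward, gradients are unscaled before clipping, the optimizer steps through the scaler, and the scaler then updates its scale factor. A self-contained sketch of that ordering, assuming `scaler` is the trainer's torch.cuda.amp.GradScaler, `forward_fn` is a closure returning the loss, and `clip_val` mirrors gradient_clip_val:

import torch

def native_amp_step(forward_fn, optimizer, scaler, parameters, clip_val=0.0):
    optimizer.zero_grad()

    # run the forward pass under autocast so eligible ops use fp16 (patch 05)
    with torch.cuda.amp.autocast():
        loss = forward_fn()

    # scale the loss so small fp16 gradients do not underflow (patch 04)
    scaler.scale(loss).backward()

    # gradients are still scaled here; unscale before clipping (patch 06)
    scaler.unscale_(optimizer)
    if clip_val > 0:
        torch.nn.utils.clip_grad_norm_(parameters, clip_val)

    # scaler.step skips the optimizer step if any gradient is inf/NaN, and
    # scaler.update adjusts the scale factor for the next iteration (patch 07)
    scaler.step(optimizer)
    scaler.update()
    return loss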
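
Patches 09 and 10 handle DataParallel by wrapping the user's forward with the autocast decorator for the duration of training, so autocast is enabled inside each replica's thread, and by restoring the original forward afterwards. An illustrative version of that wrap-and-restore pattern; the try/finally is an assumption for safety, whereas the patch restores forward after run_pretrain_routine:

import torch

def run_with_autocast_forward(model, run_fn, use_native_16bit: bool):
    original_forward = model.forward
    if use_native_16bit:
        # autocast() instances also act as decorators, so the user's forward
        # runs under autocast without any changes on their side
        model.forward = torch.cuda.amp.autocast()(model.forward)
    try:
        run_fn(model)
    finally:
        # hand the unwrapped forward back to the user
        model.forward = original_forward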
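
Patches 13 and 14 make the scaler state survive checkpointing: dump_checkpoint stores scaler.state_dict() under a 'native_amp_scaling_state' key, and restore()/hpc_load() feed it back through scaler.load_state_dict(). A minimal round-trip sketch of that behaviour; the file handling here is an assumption, not the trainer's checkpoint IO:

import torch

def save_with_scaler(model, scaler, path):
    checkpoint = {'state_dict': model.state_dict()}
    if scaler is not None:
        # patch 14 fixes this to call state_dict() rather than store the method
        checkpoint['native_amp_scaling_state'] = scaler.state_dict()
    torch.save(checkpoint, path)

def load_with_scaler(model, scaler, path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['state_dict'])
    if scaler is not None and 'native_amp_scaling_state' in checkpoint:
        scaler.load_state_dict(checkpoint['native_amp_scaling_state'])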
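
From the user's side, patch 12's LBFGS guard aside, none of this changes the API: 16-bit training is still requested through the precision flag, and the trainer picks native AMP when the installed PyTorch provides torch.cuda.amp, falling back to the Apex path otherwise. A hedged usage sketch, assuming a CUDA build of PyTorch 1.6+ and an existing LightningModule named MyModel:

from pytorch_lightning import Trainer

# native AMP is used automatically when torch.cuda.amp is available;
# otherwise the (deprecated) Apex path with amp_level='O1' still applies
trainer = Trainer(gpus=1, precision=16)
# trainer.fit(MyModel())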