diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 9902cd46237815..683f71247af5f7 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -17,6 +17,7 @@
 from pytorch_lightning.core.grads import GradInformation
 from pytorch_lightning.core.hooks import ModelHooks
 from pytorch_lightning.core.memory import ModelSummary
+from pytorch_lightning.core.properties import DeviceDtypeModuleMixin
 from pytorch_lightning.core.saving import ModelIO, load_hparams_from_tags_csv, load_hparams_from_yaml
 from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -30,14 +31,11 @@
     XLA_AVAILABLE = True


-class LightningModule(ABC, GradInformation, ModelIO, ModelHooks):
+class LightningModule(ABC, DeviceDtypeModuleMixin, GradInformation, ModelIO, ModelHooks):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-        #: Current dtype
-        self.dtype = torch.FloatTensor
-
         self.exp_save_path = None

         #: The current epoch
@@ -73,8 +71,10 @@ def __init__(self, *args, **kwargs):

         self.hparams = None

+        #: Current dtype
+        self._dtype = torch.FloatTensor
         #: device reference
-        self.device = None
+        self._device = torch.device('cpu')

     def print(self, *args, **kwargs) -> None:
         r"""
diff --git a/pytorch_lightning/core/properties.py b/pytorch_lightning/core/properties.py
new file mode 100644
index 00000000000000..eb3faf54faf6ef
--- /dev/null
+++ b/pytorch_lightning/core/properties.py
@@ -0,0 +1,156 @@
+from typing import Union, Optional
+
+import torch
+
+
+class DeviceDtypeModuleMixin(torch.nn.Module):
+    _device: ...
+    _dtype: Union[str, torch.dtype]
+
+    @property
+    def dtype(self) -> Union[str, torch.dtype]:
+        return self._dtype
+
+    @dtype.setter
+    def dtype(self, new_dtype: Union[str, torch.dtype]):
+        # necessary to avoid infinite recursion
+        raise RuntimeError('Cannot set the dtype explicitly. Please use module.to(new_dtype).')
+
+    @property
+    def device(self) -> Union[str, torch.device]:
+        return self._device
+
+    @device.setter
+    def device(self, new_device: Union[str, torch.device]):
+        # necessary to avoid infinite recursion
+        raise RuntimeError('Cannot set the device explicitly. Please use module.to(new_device).')
+
+    def to(self, *args, **kwargs) -> torch.nn.Module:
+        """Moves and/or casts the parameters and buffers.
+
+        This can be called as
+        .. function:: to(device=None, dtype=None, non_blocking=False)
+        .. function:: to(dtype, non_blocking=False)
+        .. function:: to(tensor, non_blocking=False)
+        Its signature is similar to :meth:`torch.Tensor.to`, but only accepts
+        floating point desired :attr:`dtype` s. In addition, this method will
+        only cast the floating point parameters and buffers to :attr:`dtype`
+        (if given). The integral parameters and buffers will be moved to
+        :attr:`device`, if that is given, but with dtypes unchanged. When
+        :attr:`non_blocking` is set, it tries to convert/move asynchronously
+        with respect to the host if possible, e.g., moving CPU Tensors with
+        pinned memory to CUDA devices.
+        See below for examples.
+
+        Note:
+            This method modifies the module in-place.
+
+        Args:
+            device: the desired device of the parameters
+                and buffers in this module
+            dtype: the desired floating point type of
+                the floating point parameters and buffers in this module
+            tensor: Tensor whose dtype and device are the desired
+                dtype and device for all parameters and buffers in this module
+
+        Returns:
+            Module: self
+
+        Example::
+            >>> class ExampleModule(DeviceDtypeModuleMixin):
+            ...     def __init__(self, weight: torch.Tensor):
+            ...         super().__init__()
+            ...         self.register_buffer('weight', weight)
+            >>> _ = torch.manual_seed(0)
+            >>> module = ExampleModule(torch.rand(3, 4))
+            >>> module.weight #doctest: +ELLIPSIS
+            tensor([[...]])
+            >>> module.to(torch.double)
+            ExampleModule()
+            >>> module.weight #doctest: +ELLIPSIS
+            tensor([[...]], dtype=torch.float64)
+            >>> cpu = torch.device('cpu')
+            >>> module.to(cpu, dtype=torch.half, non_blocking=True)
+            ExampleModule()
+            >>> module.weight #doctest: +ELLIPSIS
+            tensor([[...]], dtype=torch.float16)
+            >>> module.to(cpu)
+            ExampleModule()
+            >>> module.weight #doctest: +ELLIPSIS
+            tensor([[...]], dtype=torch.float16)
+        """
+        # torch._C._nn._parse_to returns a different number of values in PyTorch 1.5, so unpack by index
+        out = torch._C._nn._parse_to(*args, **kwargs)
+        device = out[0]
+        dtype = out[1]
+        if device is not None:
+            self._device = device
+
+        if dtype is not None:
+            self._dtype = dtype
+
+        return super().to(*args, **kwargs)
+
+    def cuda(self, device: Optional[int] = None) -> torch.nn.Module:
+        """Moves all model parameters and buffers to the GPU.
+        This also makes associated parameters and buffers different objects. So
+        it should be called before constructing optimizer if the module will
+        live on GPU while being optimized.
+
+        Arguments:
+            device: if specified, all parameters will be
+                copied to that device
+
+        Returns:
+            Module: self
+        """
+
+        self._device = torch.device('cuda', index=device)
+        return super().cuda(device=device)
+
+    def cpu(self) -> torch.nn.Module:
+        """Moves all model parameters and buffers to the CPU.
+        Returns:
+            Module: self
+        """
+        self._device = torch.device('cpu')
+        return super().cpu()
+
+    def type(self, dst_type: Union[str, torch.dtype]) -> torch.nn.Module:
+        """Casts all parameters and buffers to :attr:`dst_type`.
+
+        Arguments:
+            dst_type (type or string): the desired type
+
+        Returns:
+            Module: self
+        """
+        self._dtype = dst_type
+        return super().type(dst_type=dst_type)
+
+    def float(self) -> torch.nn.Module:
+        """Casts all floating point parameters and buffers to float datatype.
+
+        Returns:
+            Module: self
+        """
+        self._dtype = torch.float
+        return super().float()
+
+    def double(self) -> torch.nn.Module:
+        """Casts all floating point parameters and buffers to ``double`` datatype.
+
+        Returns:
+            Module: self
+        """
+        self._dtype = torch.double
+        return super().double()
+
+    def half(self) -> torch.nn.Module:
+        """Casts all floating point parameters and buffers to ``half`` datatype.
+
+        Returns:
+            Module: self
+        """
+        self._dtype = torch.half
+        return super().half()
diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index bd97d5ca339b03..a7925ab4fdf641 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -344,7 +344,7 @@ def ddp_train(self, process_idx, model):
         # copy model to each gpu
         if self.on_gpu:
             self.root_gpu = process_idx
-            self.device = torch.device('cuda', self.root_gpu)
+            self._device = torch.device('cuda', self.root_gpu)
             torch.cuda.set_device(self.root_gpu)
             model.cuda(self.root_gpu)

diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py
index 1bd235cebad804..842496402cf980 100644
--- a/pytorch_lightning/trainer/distrib_parts.py
+++ b/pytorch_lightning/trainer/distrib_parts.py
@@ -432,7 +432,7 @@ def copy_trainer_model_properties(self, model):
             m.use_tpu = self.use_tpu
             m.tpu_local_core_rank = self.tpu_local_core_rank
             m.tpu_global_core_rank = self.tpu_global_core_rank
-            m.device = self.device
+            m._device = self._device

     def transfer_batch_to_tpu(self, batch):
         return self.__transfer_data_to_device(batch, device='tpu')
@@ -484,7 +484,7 @@ def __transfer_data_to_device(self, batch, device, gpu_id=None):

     def single_gpu_train(self, model):
         model.cuda(self.root_gpu)
-        self.device = torch.device('cuda', self.root_gpu)
+        self._device = torch.device('cuda', self.root_gpu)

         # CHOOSE OPTIMIZER
         # allow for lr schedulers as well
@@ -501,7 +501,7 @@ def single_gpu_train(self, model):
     def tpu_train(self, tpu_core_idx, model):
         # put model on tpu
         model.to(xm.xla_device())
-        self.device = xm.xla_device()
+        self._device = xm.xla_device()

         # get the appropriate tpu ranks
         self.tpu_local_core_rank = xm.get_local_ordinal()
@@ -539,7 +539,7 @@ def dp_train(self, model):
         self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

         model.cuda(self.root_gpu)
-        self.device = torch.device('cuda', self.root_gpu)
+        self._device = torch.device('cuda', self.root_gpu)

         # hack forward to do autocast for the user
         model_autocast_original_forward = model.forward
@@ -579,7 +579,7 @@ def horovod_train(self, model):
             assert self.root_gpu == hvd.local_rank()
             torch.cuda.set_device(self.root_gpu)
             model.cuda(self.root_gpu)
-            self.device = torch.device('cuda', self.root_gpu)
+            self._device = torch.device('cuda', self.root_gpu)

         # avoid duplicating progress bar
         if hvd.rank() != 0 and self.progress_bar_callback is not None:
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d0b1aa27a53a82..9639eec8c69b8e 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -473,7 +473,7 @@ def __init__(
         # distributed backend choice
         self.distributed_backend = distributed_backend
         self.set_distributed_mode(distributed_backend)
-        self.device = torch.device('cpu')
+        self._device = torch.device('cpu')

         # override dist backend when using tpus
         if self.on_tpu:
diff --git a/tests/base/model_template.py b/tests/base/model_template.py
index 41e6edc5e3618f..d530fa4a97b129 100644
--- a/tests/base/model_template.py
+++ b/tests/base/model_template.py
@@ -34,6 +34,8 @@ class EvalModelTemplate(
 ):
     """
     This template houses all combinations of model configurations we want to test
+
+    >>> model = EvalModelTemplate()
     """
     def __init__(self, hparams: object = None) -> object:
         """Pass in parsed HyperOptArgumentParser to the model."""
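
Reviewer note, not part of the patch: below is a minimal sketch of how the mixin is expected to behave once this diff is applied. `LitModel`, its layer, and the printed values are illustrative assumptions (it also assumes a bare `LightningModule` subclass can be instantiated, as the `EvalModelTemplate` doctest above does); only the property and assignment behaviour comes from the code in this diff.

```python
import torch
from pytorch_lightning import LightningModule


class LitModel(LightningModule):  # hypothetical model, for illustration only
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 2)

    def forward(self, x):
        return self.layer(x)


model = LitModel()

# `device` and `dtype` are now read-only properties kept in sync by DeviceDtypeModuleMixin
print(model.device)  # cpu (default set in LightningModule.__init__)

model.half()
print(model.dtype)   # torch.float16, updated by half()/float()/double()/to()/type()

if torch.cuda.is_available():
    model.cuda(0)
    print(model.device)  # cuda:0

# direct assignment is rejected so the cached value cannot go stale
try:
    model.device = torch.device('cuda', 0)
except RuntimeError as err:
    print(err)  # Cannot set the device explicitly. Please use module.to(new_device).
```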