Deprecate TrainerOptimizersMixin and move functionality to core/optimizer.py #11155

Merged: 38 commits, Dec 23, 2021
Commits (38)
68d60fa
refactor init_optimizers
daniellepintz Dec 18, 2021
e7e3b59
fix tests
daniellepintz Dec 18, 2021
628e4a0
fix tests
daniellepintz Dec 18, 2021
68caa57
fix another test
daniellepintz Dec 19, 2021
0396959
attempt to fix deepspeed test
daniellepintz Dec 19, 2021
b645134
addr comments
daniellepintz Dec 19, 2021
47bdae1
deprecate optimizer mixin
daniellepintz Dec 19, 2021
aff9de3
addr comments
daniellepintz Dec 20, 2021
74a3033
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Dec 20, 2021
3d1faaa
move _convert_to_lightning_optimizer to core/optimizer.py
daniellepintz Dec 20, 2021
8c1267f
fix typing and docstring
daniellepintz Dec 21, 2021
4a1ee1a
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Dec 21, 2021
108c4a2
remove strategy refactor
daniellepintz Dec 21, 2021
a768823
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 21, 2021
b27e2e1
small fix
daniellepintz Dec 21, 2021
e8d6cf6
Merge branch 'optimizers_mixin' of github.com:daniellepintz/pytorch-l…
daniellepintz Dec 21, 2021
7d72f56
fix mypy
daniellepintz Dec 21, 2021
4fda88f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 21, 2021
be56fcd
addr comment
daniellepintz Dec 21, 2021
0da03c8
Merge branch 'optimizers_mixin' of github.com:daniellepintz/pytorch-l…
daniellepintz Dec 21, 2021
779c0a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 21, 2021
362a1b7
fix test
daniellepintz Dec 21, 2021
81c6788
Merge branch 'optimizers_mixin' of github.com:daniellepintz/pytorch-l…
daniellepintz Dec 21, 2021
493a582
addr comments
daniellepintz Dec 22, 2021
b20d6f4
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Dec 22, 2021
1c30527
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Dec 22, 2021
8b80304
fix merge conflicts from strategy
daniellepintz Dec 22, 2021
5397d2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 22, 2021
5da0649
addr comments
daniellepintz Dec 22, 2021
f320402
Merge branch 'optimizers_mixin' of github.com:daniellepintz/pytorch-l…
daniellepintz Dec 22, 2021
119bc58
fix whitespace
daniellepintz Dec 22, 2021
ecb3c60
fix weakref test
daniellepintz Dec 22, 2021
fd1b14d
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
daniellepintz Dec 22, 2021
8ba6f37
add comment
daniellepintz Dec 22, 2021
d2c6b6e
addr comments
daniellepintz Dec 22, 2021
bb6ba78
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 22, 2021
41bbfb8
fix mypy
daniellepintz Dec 22, 2021
126533d
Merge branch 'optimizers_mixin' of github.com:daniellepintz/pytorch-l…
daniellepintz Dec 22, 2021
1 change: 0 additions & 1 deletion pyproject.toml
@@ -93,7 +93,6 @@ module = [
"pytorch_lightning.trainer.connectors.checkpoint_connector",
"pytorch_lightning.trainer.connectors.data_connector",
"pytorch_lightning.trainer.data_loading",
"pytorch_lightning.trainer.optimizers",
"pytorch_lightning.trainer.supporters",
"pytorch_lightning.trainer.trainer",
"pytorch_lightning.tuner.batch_size_scaling",
3 changes: 1 addition & 2 deletions pytorch_lightning/callbacks/stochastic_weight_avg.py
@@ -24,7 +24,6 @@

import pytorch_lightning as pl
from pytorch_lightning.callbacks.base import Callback
from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn
from pytorch_lightning.utilities.exceptions import MisconfigurationException

@@ -182,7 +181,7 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo
anneal_strategy=self._annealing_strategy,
last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1,
)
default_scheduler_cfg = _get_default_scheduler_config()
default_scheduler_cfg = pl.LightningModule._get_default_scheduler_config()
assert default_scheduler_cfg["interval"] == "epoch" and default_scheduler_cfg["frequency"] == 1
default_scheduler_cfg["scheduler"] = self._swa_scheduler

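For orientation, a minimal sketch (assuming this PR's branch of pytorch_lightning is installed; `_get_default_scheduler_config` is private and may change) of the default scheduler config that the SWA callback now reads from `pl.LightningModule`:

import pytorch_lightning as pl

# Staticmethod moved onto LightningModule by this PR (previously
# pytorch_lightning.trainer.optimizers._get_default_scheduler_config).
default_cfg = pl.LightningModule._get_default_scheduler_config()
assert default_cfg["interval"] == "epoch" and default_cfg["frequency"] == 1
# Per the diff below, scheduler/name/monitor/opt_idx default to None,
# reduce_on_plateau to False, and strict to True.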
184 changes: 182 additions & 2 deletions pytorch_lightning/core/lightning.py
@@ -24,7 +24,7 @@
from typing import Any, Callable, Dict, List, Mapping, Optional, overload, Tuple, Union

import torch
from torch import ScriptModule, Tensor
from torch import optim, ScriptModule, Tensor
from torch.nn import Module
from torch.optim.optimizer import Optimizer
from torchmetrics import Metric
@@ -34,7 +34,7 @@
from pytorch_lightning.callbacks.progress import base as progress_base
from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks
from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin, HyperparametersMixin
from pytorch_lightning.core.optimizer import LightningOptimizer
from pytorch_lightning.core.optimizer import _MockOptimizer, LightningOptimizer
from pytorch_lightning.core.saving import ModelIO
from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator
from pytorch_lightning.utilities import (
@@ -1969,3 +1969,183 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None:

self._register_state_dict_hook(state_dict_hook)
self._register_load_state_dict_pre_hook(pre_load_state_dict_hook, True)

def init_optimizers_and_lr_schedulers(self) -> Tuple[List, List, List]:
optim_conf = self.configure_optimizers()
if optim_conf is None:
rank_zero_warn(
"`LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer",
)
optim_conf = _MockOptimizer()

optimizers, lr_schedulers, optimizer_frequencies, monitor = self._configure_optimizers(optim_conf)
lr_schedulers = self._configure_schedulers(lr_schedulers, monitor, not self.automatic_optimization)
_validate_scheduler_optimizer(optimizers, lr_schedulers)
return optimizers, lr_schedulers, optimizer_frequencies

@staticmethod
def _configure_optimizers(
optim_conf: Union[Dict[str, Any], List, Optimizer, Tuple]
) -> Tuple[List, List, List, Optional[str]]:
optimizers, lr_schedulers, optimizer_frequencies = [], [], []
monitor = None

# single output, single optimizer
if isinstance(optim_conf, Optimizer):
optimizers = [optim_conf]
# two lists, optimizer + lr schedulers
elif (
isinstance(optim_conf, (list, tuple))
and len(optim_conf) == 2
and isinstance(optim_conf[0], list)
and all(isinstance(opt, Optimizer) for opt in optim_conf[0])
):
opt, sch = optim_conf
optimizers = opt
lr_schedulers = sch if isinstance(sch, list) else [sch]
# single dictionary
elif isinstance(optim_conf, dict):
_validate_optim_conf(optim_conf)
optimizers = [optim_conf["optimizer"]]
monitor = optim_conf.get("monitor", None)
lr_schedulers = [optim_conf["lr_scheduler"]] if "lr_scheduler" in optim_conf else []
# multiple dictionaries
elif isinstance(optim_conf, (list, tuple)) and all(isinstance(d, dict) for d in optim_conf):
for opt_dict in optim_conf:
_validate_optim_conf(opt_dict)
optimizers = [opt_dict["optimizer"] for opt_dict in optim_conf]
scheduler_dict = (
lambda scheduler, opt_idx: dict(scheduler, opt_idx=opt_idx)
if isinstance(scheduler, dict)
else {"scheduler": scheduler, "opt_idx": opt_idx}
)

lr_schedulers = [
scheduler_dict(opt_dict["lr_scheduler"], opt_idx)
for opt_idx, opt_dict in enumerate(optim_conf)
if "lr_scheduler" in opt_dict
]
optimizer_frequencies = [
opt_dict["frequency"] for opt_dict in optim_conf if opt_dict.get("frequency", None) is not None
]
# assert that if frequencies are present, they are given for all optimizers
if optimizer_frequencies and len(optimizer_frequencies) != len(optimizers):
raise ValueError("A frequency must be given to each optimizer.")
# single list or tuple, multiple optimizer
elif isinstance(optim_conf, (list, tuple)) and all(isinstance(opt, Optimizer) for opt in optim_conf):
optimizers = list(optim_conf)
# unknown configuration
else:
raise MisconfigurationException(
"Unknown configuration for model optimizers."
" Output from `model.configure_optimizers()` should either be:\n"
" * `torch.optim.Optimizer`\n"
" * [`torch.optim.Optimizer`]\n"
" * ([`torch.optim.Optimizer`], [`torch.optim.lr_scheduler`])\n"
' * {"optimizer": `torch.optim.Optimizer`, (optional) "lr_scheduler": `torch.optim.lr_scheduler`}\n'
' * A list of the previously described dict format, with an optional "frequency" key (int)'
)
return optimizers, lr_schedulers, optimizer_frequencies, monitor

@staticmethod
def _configure_schedulers(
schedulers: list, monitor: Optional[str], is_manual_optimization: bool
) -> List[Dict[str, Any]]:
"""Convert each scheduler into dict structure with relevant information."""
lr_schedulers = []
default_config = LightningModule._get_default_scheduler_config()
for scheduler in schedulers:
if is_manual_optimization:
if isinstance(scheduler, dict):
invalid_keys = {"interval", "frequency", "reduce_on_plateau", "monitor", "strict"}
keys_to_warn = [k for k in scheduler.keys() if k in invalid_keys]

if keys_to_warn:
rank_zero_warn(
f"The lr scheduler dict contains the key(s) {keys_to_warn}, but the keys will be ignored."
" You need to call `lr_scheduler.step()` manually in manual optimization.",
category=RuntimeWarning,
)

scheduler = {key: scheduler[key] for key in scheduler if key not in invalid_keys}
lr_schedulers.append({**default_config, **scheduler})
else:
lr_schedulers.append({**default_config, "scheduler": scheduler})
else:
if isinstance(scheduler, dict):
# check provided keys
extra_keys = [k for k in scheduler.keys() if k not in default_config.keys()]
if extra_keys:
rank_zero_warn(
f"Found unsupported keys in the lr scheduler dict: {extra_keys}", category=RuntimeWarning
)
if "scheduler" not in scheduler:
raise MisconfigurationException(
'The lr scheduler dict must have the key "scheduler" with its item being an lr scheduler'
)
if "interval" in scheduler and scheduler["interval"] not in ("step", "epoch"):
raise MisconfigurationException(
'The "interval" key in lr scheduler dict must be "step" or "epoch"'
f' but is "{scheduler["interval"]}"'
)
scheduler["reduce_on_plateau"] = isinstance(
scheduler["scheduler"], optim.lr_scheduler.ReduceLROnPlateau
)
if scheduler["reduce_on_plateau"] and scheduler.get("monitor", None) is None:
raise MisconfigurationException(
"The lr scheduler dict must include a monitor when a `ReduceLROnPlateau` scheduler is used."
' For example: {"optimizer": optimizer, "lr_scheduler":'
' {"scheduler": scheduler, "monitor": "your_loss"}}'
)
is_one_cycle = isinstance(scheduler["scheduler"], optim.lr_scheduler.OneCycleLR)
if is_one_cycle and scheduler.get("interval", "epoch") == "epoch":
rank_zero_warn(
"A `OneCycleLR` scheduler is using 'interval': 'epoch'."
" Are you sure you didn't mean 'interval': 'step'?",
category=RuntimeWarning,
)
lr_schedulers.append({**default_config, **scheduler})
elif isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau):
if monitor is None:
raise MisconfigurationException(
"`configure_optimizers` must include a monitor when a `ReduceLROnPlateau`"
" scheduler is used. For example:"
' {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "metric_to_track"}'
)
lr_schedulers.append(
{**default_config, "scheduler": scheduler, "reduce_on_plateau": True, "monitor": monitor}
)
elif isinstance(scheduler, optim.lr_scheduler._LRScheduler):
lr_schedulers.append({**default_config, "scheduler": scheduler})
else:
raise ValueError(f'The provided lr scheduler "{scheduler}" is invalid')
return lr_schedulers

@staticmethod
def _get_default_scheduler_config() -> Dict[str, Any]:
return {
"scheduler": None,
"name": None, # no custom name
"interval": "epoch", # after epoch is over
"frequency": 1, # every epoch/batch
"reduce_on_plateau": False, # most often not ReduceLROnPlateau scheduler
"monitor": None, # value to monitor for ReduceLROnPlateau
"strict": True, # enforce that the monitor exists for ReduceLROnPlateau
"opt_idx": None, # necessary to store opt_idx when optimizer frequencies are specified
}


def _validate_scheduler_optimizer(optimizers, lr_schedulers):
if any(sch["scheduler"].optimizer not in optimizers for sch in lr_schedulers):
raise MisconfigurationException(
"Some schedulers are attached with an optimizer that wasn't returned from `configure_optimizers`."
)


def _validate_optim_conf(optim_conf: Dict[str, Any]) -> None:
valid_keys = {"optimizer", "lr_scheduler", "frequency", "monitor"}
extra_keys = optim_conf.keys() - valid_keys
if extra_keys:
rank_zero_warn(
f"Found unsupported keys in the optimizer configuration: {set(extra_keys)}", category=RuntimeWarning
)
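A short usage sketch of the new `init_optimizers_and_lr_schedulers` entry point and the single-dictionary `configure_optimizers` format parsed above. The module, layer sizes, and metric name are illustrative, not from the PR:

import torch
from torch import nn
import pytorch_lightning as pl


class SketchModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        loss = self.layer(batch).sum()
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.1)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
        # Single-dictionary format; "monitor" is required because ReduceLROnPlateau is used.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "train_loss"},
        }


model = SketchModule()
# New entry point added by this PR: returns (optimizers, scheduler dicts, frequencies).
optimizers, lr_schedulers, frequencies = model.init_optimizers_and_lr_schedulers()
assert lr_schedulers[0]["reduce_on_plateau"] and lr_schedulers[0]["monitor"] == "train_loss"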
30 changes: 29 additions & 1 deletion pytorch_lightning/core/optimizer.py
@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import contextmanager
from typing import Any, Callable, Generator, Optional
from typing import Any, Callable, Dict, Generator, Optional
from weakref import proxy

import torch
from torch.optim import Optimizer

import pytorch_lightning as pl
@@ -162,3 +163,30 @@ def closure_dis():
assert trainer is not None
with trainer.profiler.profile(profiler_action):
trainer.training_type_plugin.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)


class _MockOptimizer(Optimizer):
"""The `_MockOptimizer` will be used inplace of an optimizer in the event that `None` is returned from
`configure_optimizers`."""

def __init__(self) -> None:
super().__init__([torch.zeros(1)], {})

def add_param_group(self, param_group: Dict[Any, Any]) -> None:
pass # Do Nothing

def load_state_dict(self, state_dict: Dict[Any, Any]) -> None:
pass # Do Nothing

def state_dict(self) -> Dict[Any, Any]:
return {} # Return Empty

def step(self, closure: Callable = None) -> None:
if closure is not None:
closure()

def zero_grad(self, set_to_none: Optional[bool] = False) -> None:
pass # Do Nothing

def __repr__(self) -> str:
return "No Optimizer"
15 changes: 3 additions & 12 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -32,7 +32,6 @@
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
from pytorch_lightning.plugins.precision import PrecisionPlugin
from pytorch_lightning.plugins.training_type.ddp import DDPPlugin
from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
from pytorch_lightning.trainer.states import TrainerFn
from pytorch_lightning.utilities import GradClipAlgorithmType
from pytorch_lightning.utilities.apply_func import apply_to_collection
@@ -444,16 +443,14 @@ def init_deepspeed(self):
self._initialize_deepspeed_inference(model)

def _init_optimizers(self) -> Tuple[Optimizer, Optional[Union[LRSchedulerTypeTuple]], Optional[int]]:
optimizers, schedulers, optimizer_frequencies = self.lightning_module.trainer.init_optimizers(
self.lightning_module
)
optimizers, schedulers, optimizer_frequencies = self.lightning_module.init_optimizers_and_lr_schedulers()
if len(optimizers) > 1 or len(schedulers) > 1:
raise MisconfigurationException(
"DeepSpeed currently only supports single optimizer, single optional scheduler."
)
return (
optimizers[0],
schedulers[0] if schedulers else _get_default_scheduler_config(),
schedulers[0] if schedulers else pl.LightningModule._get_default_scheduler_config(),
optimizer_frequencies[0] if optimizer_frequencies else None,
)

@@ -463,7 +460,7 @@ def zero_stage_3(self) -> bool:

def _initialize_deepspeed_train(self, model):
if "optimizer" in self.config:
optimizer, lr_scheduler = None, _get_default_scheduler_config()
optimizer, lr_scheduler = None, pl.LightningModule._get_default_scheduler_config()
else:
rank_zero_info(
"You have not specified an optimizer or scheduler within the DeepSpeed config."
@@ -562,12 +559,6 @@ def distributed_sampler_kwargs(self):
distributed_sampler_kwargs = dict(num_replicas=self.world_size, rank=self.global_rank)
return distributed_sampler_kwargs

def init_optimizers(self, trainer: "pl.Trainer", model: "pl.LightningModule") -> Tuple[List, List, List]:
# Skip initializing optimizers here as DeepSpeed handles optimizers via config.
# User may have specified config options instead in configure_optimizers, but this is handled
# via `_initialize_deepspeed_train`
return [], [], [] # empty optimizers, schedulers and frequencies

@property
def handles_gradient_accumulation(self) -> bool:
"""Whether the plugin handles gradient accumulation internally."""
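A hedged sketch of the constraint enforced in `_init_optimizers` above: under the DeepSpeed plugin, `configure_optimizers` may return at most one optimizer and one optional scheduler, so a multi-optimizer module like this illustrative one would be rejected with a `MisconfigurationException` during setup:

import torch
from torch import nn
import pytorch_lightning as pl


class TwoOptimizerModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.gen = nn.Linear(2, 2)
        self.disc = nn.Linear(2, 2)

    def configure_optimizers(self):
        # Two optimizers are accepted by Lightning in general, but
        # DeepSpeedPlugin._init_optimizers rejects more than one optimizer or scheduler.
        return (
            torch.optim.Adam(self.gen.parameters(), lr=1e-3),
            torch.optim.Adam(self.disc.parameters(), lr=1e-3),
        )


model = TwoOptimizerModule()
opts, scheds, freqs = model.init_optimizers_and_lr_schedulers()
assert len(opts) == 2  # this is what DeepSpeedPlugin._init_optimizers would reject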
pytorch_lightning/plugins/training_type/training_type_plugin.py
@@ -105,9 +105,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None:
"""
if trainer.state.fn not in (TrainerFn.FITTING, TrainerFn.TUNING):
return
optimizers, lr_schedulers, optimizer_frequencies = self.init_optimizers(
trainer=trainer, model=self.lightning_module
)
optimizers, lr_schedulers, optimizer_frequencies = self.lightning_module.init_optimizers_and_lr_schedulers()
self.optimizers = optimizers
self.lr_schedulers = lr_schedulers
self.optimizer_frequencies = optimizer_frequencies
@@ -377,9 +375,6 @@ def process_dataloader(self, dataloader: DataLoader) -> DataLoader:
"""
return dataloader

def init_optimizers(self, trainer: "pl.Trainer", model: "pl.LightningModule"):
return trainer.init_optimizers(model)

@property
def restore_checkpoint_after_pre_dispatch(self) -> bool:
"""Override to delay restoring from checkpoint till after pre-dispatch. This is useful when the plugin
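A minimal sketch (assuming this PR's branch; `_configure_schedulers` is private) of the scheduler normalization behind `setup_optimizers` above: a bare scheduler returned from `configure_optimizers` is wrapped in the default config dict before being stored on the plugin as part of `lr_schedulers`:

import torch
import pytorch_lightning as pl

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)

# Staticmethod added to LightningModule by this PR.
normalized = pl.LightningModule._configure_schedulers(
    [scheduler], monitor=None, is_manual_optimization=False
)
assert normalized[0]["scheduler"] is scheduler
assert normalized[0]["interval"] == "epoch" and normalized[0]["frequency"] == 1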