Commit

Merge branch 'master' into master
Borda committed Jan 12, 2023
2 parents c44a892 + 0876a64 commit 77043b0
Showing 48 changed files with 174 additions and 656 deletions.
4 changes: 2 additions & 2 deletions .azure/app-cloud-e2e.yml
@@ -176,7 +176,7 @@ jobs:
#LAI_PASS: $(LAI_PASS) # for STAGING
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME_PROD)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
displayName: 'Run the tests'
@@ -196,7 +196,7 @@ jobs:
#LAI_PASS: $(LAI_PASS) # for STAGING
LIGHTNING_USER_ID: $(LIGHTNING_USER_ID_PROD)
LIGHTNING_API_KEY: $(LIGHTNING_API_KEY_PROD)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME)
LIGHTNING_USERNAME: $(LIGHTNING_USERNAME_PROD)
LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL_PROD)
timeoutInMinutes: "3"
displayName: 'Clean Previous Apps'
47 changes: 0 additions & 47 deletions docs/source-pytorch/ecosystem/transformers.rst

This file was deleted.

1 change: 0 additions & 1 deletion docs/source-pytorch/fabric/api/api_reference.rst
@@ -124,7 +124,6 @@ Strategies
Strategy
DDPStrategy
DataParallelStrategy
DDPShardedStrategy
FSDPStrategy
ParallelStrategy
SingleDeviceStrategy
7 changes: 2 additions & 5 deletions docs/source-pytorch/fabric/api/fabric_args.rst
@@ -8,7 +8,7 @@ Fabric Arguments
accelerator
===========

Choose one of ``"cpu"``, ``"gpu"``, ``"tpu"``, ``"auto"`` (IPU support is coming soon).
Choose one of ``"cpu"``, ``"gpu"``, ``"tpu"``, ``"auto"``.

.. code-block:: python
@@ -35,7 +35,7 @@ The ``"auto"`` option recognizes the machine you are on and selects the availabl
strategy
========

Choose a training strategy: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"tpu_spawn"``, ``"deepspeed"``, ``"ddp_sharded"``, or ``"ddp_sharded_spawn"``.
Choose a training strategy: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"xla"``, ``"deepspeed"``, ``"fsdp"``.

.. code-block:: python
@@ -55,9 +55,6 @@ Additionally, you can pass in your custom strategy by configuring additional par
fabric = Fabric(strategy=DeepSpeedStrategy(stage=2), accelerator="gpu", devices=2)
Support for Fully Sharded training strategies are coming soon.


devices
=======

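For orientation, here is a minimal sketch of how the flags documented above are passed to `Fabric` after this change (a sketch assuming the `lightning_fabric` package layout shown in this diff and a machine with at least two GPUs; `"fsdp"` takes over from the removed `"ddp_sharded"`/`"ddp_sharded_spawn"` options):

```python
from lightning_fabric import Fabric

# accelerator: "cpu", "gpu", "tpu", or "auto"; strategy: one of the names
# listed above -- "fsdp" replaces the removed sharded-DDP flags.
fabric = Fabric(accelerator="gpu", devices=2, strategy="fsdp")
fabric.launch()
```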
4 changes: 2 additions & 2 deletions requirements/app/components.txt
@@ -1,5 +1,5 @@
# deps required by components in the lightning app repository (src/lightning_app/components)
lightning_api_access>=0.0.3 # serve
aiohttp>=3.8.0, <=3.8.3 # auto_scaler
# lightning_fabric # multinode # uncomment when released. it's okay to comment for now because pl includes it
pytorch_lightning # multinode
# lightning-fabric>=1.9.0 # multinode # uncomment when released. it's okay to comment for now because pl includes it
pytorch-lightning>1.8.0 # multinode
2 changes: 0 additions & 2 deletions requirements/fabric/strategies.txt
@@ -1,5 +1,3 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

fairscale>=0.4.5, <0.4.13
deepspeed>=0.6.0, <=0.7.0
1 change: 0 additions & 1 deletion src/lightning_app/components/multi_node/lite.py
@@ -40,7 +40,6 @@ def run(
try:
pkg = importlib.import_module(pkg_name)
fabrics.append(pkg.Fabric)
strategies.append(pkg.strategies.DDPShardedStrategy)
strategies.append(pkg.strategies.DDPStrategy)
mps_accelerators.append(pkg.accelerators.MPSAccelerator)
except (ImportError, ModuleNotFoundError):
2 changes: 1 addition & 1 deletion src/lightning_fabric/CHANGELOG.md
@@ -65,7 +65,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Removed

-
- Removed support for FairScale's sharded training (`strategy='ddp_sharded'|'ddp_sharded_spawn'`). Use Fully-Sharded Data Parallel instead (`strategy='fsdp'`) ([#16329](https://github.com/Lightning-AI/lightning/pull/16329))

### Fixed

24 changes: 2 additions & 22 deletions src/lightning_fabric/connector.py
@@ -43,8 +43,6 @@
from lightning_fabric.plugins.precision.fsdp import FSDPPrecision
from lightning_fabric.plugins.precision.precision import _PRECISION_INPUT, _PRECISION_INPUT_INT, _PRECISION_INPUT_STR
from lightning_fabric.strategies import (
DDPShardedStrategy,
DDPStrategy,
DeepSpeedStrategy,
SingleDeviceStrategy,
SingleTPUStrategy,
@@ -54,7 +52,7 @@
)
from lightning_fabric.strategies.ddp import _DDP_FORK_ALIASES
from lightning_fabric.strategies.fsdp import _FSDP_ALIASES, FSDPStrategy
from lightning_fabric.utilities import _StrategyType, rank_zero_info, rank_zero_warn
from lightning_fabric.utilities import rank_zero_info, rank_zero_warn
from lightning_fabric.utilities.device_parser import _determine_root_gpu_device
from lightning_fabric.utilities.imports import _IS_INTERACTIVE

@@ -516,7 +514,7 @@ def _lazy_init_strategy(self) -> None:
raise RuntimeError(
f"`Fabric(strategy={self._strategy_flag!r})` is not compatible with an interactive"
" environment. Run your code as a script, or choose one of the compatible strategies:"
f" Fabric(strategy=None|{'|'.join(_StrategyType.interactive_compatible_types())})."
f" `Fabric(strategy=None|'dp'|'ddp_notebook')`."
" In case you are spawning processes yourself, make sure to include the Fabric"
" creation inside the worker function."
)
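The updated error message above names the strategies that still work interactively. A hedged sketch of what that looks like in a notebook (assuming a Jupyter session on a multi-GPU machine; `"ddp_notebook"` is one of the aliases listed in the message):

```python
from lightning_fabric import Fabric

# Inside a notebook, spawn/script-only strategies trigger the RuntimeError
# above; strategy=None, "dp", or "ddp_notebook" remain compatible.
fabric = Fabric(strategy="ddp_notebook", accelerator="gpu", devices=2)
fabric.launch()
```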
@@ -547,21 +545,3 @@ def _argument_from_env(name: str, current: Any, default: Any) -> Any:
if env_value is None:
return current
return env_value

@property
def is_distributed(self) -> bool:
# TODO: deprecate this property
# Used for custom plugins.
# Custom plugins should implement is_distributed property.
if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator):
return self.strategy.is_distributed
distributed_strategy = (
DDPStrategy,
DDPShardedStrategy,
DeepSpeedStrategy,
XLAStrategy,
)
is_distributed = isinstance(self.strategy, distributed_strategy)
if isinstance(self.accelerator, TPUAccelerator):
is_distributed |= self.strategy.is_distributed
return is_distributed
26 changes: 6 additions & 20 deletions src/lightning_fabric/fabric.py
@@ -16,7 +16,7 @@
from contextlib import contextmanager, nullcontext
from functools import partial
from pathlib import Path
from typing import Any, Callable, cast, Dict, Generator, List, Optional, overload, Sequence, Tuple, Union
from typing import Any, Callable, cast, Dict, Generator, List, Mapping, Optional, overload, Sequence, Tuple, Union

import torch
import torch.nn as nn
@@ -32,14 +32,7 @@
from lightning_fabric.plugins import Precision # avoid circular imports: # isort: split
from lightning_fabric.accelerators.accelerator import Accelerator
from lightning_fabric.connector import _Connector, _PLUGIN_INPUT, _PRECISION_INPUT
from lightning_fabric.strategies import (
DDPShardedStrategy,
DeepSpeedStrategy,
FSDPStrategy,
SingleDeviceStrategy,
Strategy,
XLAStrategy,
)
from lightning_fabric.strategies import DeepSpeedStrategy, FSDPStrategy, SingleDeviceStrategy, Strategy, XLAStrategy
from lightning_fabric.strategies.strategy import _Sharded, TBroadcast
from lightning_fabric.utilities import move_data_to_device
from lightning_fabric.utilities.apply_func import convert_tensors_to_scalars, convert_to_tensors
@@ -69,7 +62,7 @@ class Fabric:
accelerator: The hardware to run on. Possible choices are:
``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``.
strategy: Strategy for how to run across multiple devices. Possible choices are:
``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"ddp_sharded"``.
``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"fsdp"``.
devices: Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``.
The value applies per node.
num_nodes: Number of GPU nodes for distributed training.
@@ -604,7 +597,7 @@ def log(self, name: str, value: Any, step: Optional[int] = None) -> None:
"""
self.log_dict(metrics={name: value}, step=step)

def log_dict(self, metrics: Dict[str, Any], step: Optional[int] = None) -> None:
def log_dict(self, metrics: Mapping[str, Any], step: Optional[int] = None) -> None:
"""Log multiple scalars at once to all loggers that were added to Fabric.
Args:
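Since `log_dict` now accepts any `Mapping` rather than only a `dict`, read-only mappings can be passed as well. A small sketch (metric names are made up; with no loggers attached the call is a no-op):

```python
from types import MappingProxyType

from lightning_fabric import Fabric

fabric = Fabric(accelerator="cpu")

# Any Mapping works now, not just a plain dict.
metrics = MappingProxyType({"train_loss": 0.42, "lr": 1e-3})
fabric.log_dict(metrics, step=10)
```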
@@ -673,7 +666,7 @@ def _move_model_to_device(self, model: nn.Module, optimizers: List[Optimizer]) -

def _requires_distributed_sampler(self, dataloader: DataLoader) -> bool:
return (
self._connector.is_distributed
getattr(self.strategy, "distributed_sampler_kwargs", None) is not None
and not isinstance(dataloader.sampler, DistributedSampler)
and not has_iterable_dataset(dataloader)
)
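The replacement above moves the decision from the connector's removed `is_distributed` flag to the strategy itself: a `DistributedSampler` is injected only when the strategy advertises `distributed_sampler_kwargs`. A standalone sketch of that rule (hypothetical helper name, iterable-dataset guard omitted):

```python
from torch.utils.data import DataLoader, DistributedSampler


def needs_distributed_sampler(strategy, dataloader: DataLoader) -> bool:
    # Strategies that expose the kwargs needed to build a sampler
    # (num_replicas/rank) opt in; DataParallelStrategy returns None and
    # therefore opts out.
    kwargs = getattr(strategy, "distributed_sampler_kwargs", None)
    return kwargs is not None and not isinstance(dataloader.sampler, DistributedSampler)
```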
@@ -713,15 +706,8 @@ def _validate_setup_module(self, module: nn.Module) -> None:
if isinstance(module, _FabricModule):
raise ValueError("A model should be passed only once to the `setup_module` method.")

if isinstance(self._strategy, DDPShardedStrategy):
raise RuntimeError(
f"The `{type(self._strategy).__name__}` requires the model and optimizer(s) to be set up jointly"
" through `.setup(model, optimizer, ...)`. For inference, choose a different strategy, for example"
" `ddp`."
)

def _validate_setup_optimizers(self, optimizers: Sequence[Optimizer]) -> None:
if isinstance(self._strategy, (DeepSpeedStrategy, DDPShardedStrategy, XLAStrategy)):
if isinstance(self._strategy, (DeepSpeedStrategy, XLAStrategy)):
raise RuntimeError(
f"The `{type(self._strategy).__name__}` requires the model and optimizer(s) to be set up jointly"
" through `.setup(model, optimizer, ...)`."
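For reference, a hedged sketch of the two setup styles that `_validate_setup_optimizers` distinguishes after this change (assuming a CPU run so it executes without special hardware; DeepSpeed and XLA still require the joint form):

```python
import torch

from lightning_fabric import Fabric

fabric = Fabric(accelerator="cpu")
fabric.launch()

model = torch.nn.Linear(32, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Joint setup: required for DeepSpeedStrategy and XLAStrategy.
model, optimizer = fabric.setup(model, optimizer)

# Split setup: allowed for the other strategies now that the
# DDPShardedStrategy-specific restriction is gone.
# model = fabric.setup_module(model)
# optimizer = fabric.setup_optimizers(optimizer)
```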
1 change: 0 additions & 1 deletion src/lightning_fabric/strategies/__init__.py
@@ -14,7 +14,6 @@
from lightning_fabric.strategies.ddp import DDPStrategy # noqa: F401
from lightning_fabric.strategies.deepspeed import DeepSpeedStrategy # noqa: F401
from lightning_fabric.strategies.dp import DataParallelStrategy # noqa: F401
from lightning_fabric.strategies.fairscale import DDPShardedStrategy # noqa: F401
from lightning_fabric.strategies.fsdp import FSDPStrategy # noqa: F401
from lightning_fabric.strategies.parallel import ParallelStrategy # noqa: F401
from lightning_fabric.strategies.registry import _call_register_strategies, _StrategyRegistry
4 changes: 0 additions & 4 deletions src/lightning_fabric/strategies/ddp.py
@@ -81,10 +81,6 @@ def root_device(self) -> torch.device:
assert self.parallel_devices is not None
return self.parallel_devices[self.local_rank]

@property
def is_distributed(self) -> bool:
return True

@property
def num_nodes(self) -> int:
return self._num_nodes
4 changes: 4 additions & 0 deletions src/lightning_fabric/strategies/dp.py
@@ -50,6 +50,10 @@ def root_device(self) -> torch.device:
assert self.parallel_devices is not None
return self.parallel_devices[0]

@property
def distributed_sampler_kwargs(self) -> None:
return None

def setup_module(self, module: Module) -> DataParallel:
"""Wraps the given model into a :class:`~torch.nn.parallel.DataParallel` module."""
return DataParallel(module=module, device_ids=self.parallel_devices)
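A short sketch of what the new property means in practice (class name taken from this diff; constructing the strategy directly is only for illustration):

```python
from lightning_fabric.strategies import DataParallelStrategy

# Returning None signals that no DistributedSampler should be injected for
# DP: a single process sees the full dataset.
strategy = DataParallelStrategy()
assert strategy.distributed_sampler_kwargs is None
```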
