mosaicml · b-chu · Oct 21, 2024 · Oct 17, 2024
@@ -20,11 +20,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.11-2.2
-          container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
-          markers: not daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: cpu-3.11-2.3
           container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and not doctest
@@ -35,8 +30,13 @@ jobs:
           markers: not daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
-        - name: cpu-3.11-2.4-composer
-          container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+        - name: cpu-3.11-2.5
+          container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
+          markers: not daily and (remote or not remote) and not gpu and not doctest
+          pytest_command: coverage run -m pytest
+          composer_package_name: mosaicml
+        - name: cpu-3.11-2.5-composer
+          container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: composer
@@ -45,11 +45,6 @@ jobs:
           markers: not daily and (remote or not remote) and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
           composer_package_name: mosaicml
-        - name: daily-cpu-3.11-2.2
-          container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
-          markers: daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: daily-cpu-3.11-2.3
           container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and not doctest
@@ -60,13 +55,18 @@ jobs:
           markers: daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
-        - name: daily-cpu-3.11-2.4-composer
-          container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+        - name: daily-cpu-3.11-2.5
+          container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
+          markers: daily and (remote or not remote) and not gpu and not doctest
+          pytest_command: coverage run -m pytest
+          composer_package_name: mosaicml
+        - name: daily-cpu-3.11-2.5-composer
+          container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and not doctest
           pytest_command: coverage run -m pytest
           composer_package_name: composer
         - name: daily-cpu-doctest
-          container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+          container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
           composer_package_name: mosaicml
@@ -107,12 +107,6 @@ jobs:
         include:
         # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
         # on MCLOUD and not eat up all GPUs at once
-        - name: "gpu-3.11-2.2-1-gpu"
-          container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 1
         - name: "gpu-3.11-2.3-1-gpu"
           container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -125,12 +119,12 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 1
-        - name: "gpu-3.11-2.2-2-gpu"
-          container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
+        - name: "gpu-3.11-2.5-1-gpu"
+          container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
-          gpu_num: 2
+          gpu_num: 1
         - name: "gpu-3.11-2.3-2-gpu"
           container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -143,12 +137,12 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 2
-        - name: "gpu-3.11-2.2-4-gpu"
-          container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
+        - name: "gpu-3.11-2.5-2-gpu"
+          container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
-          gpu_num: 4
+          gpu_num: 2
         - name: "gpu-3.11-2.3-4-gpu"
           container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -161,6 +155,12 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 4
+        - name: "gpu-3.11-2.5-4-gpu"
+          container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
+          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
+          pytest_command: "coverage run -m pytest"
+          composer_package_name: "mosaicml"
+          gpu_num: 4
     steps:
     - name: Checkout code
       uses: actions/checkout@v3

@@ -16,10 +16,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.11-2.2
-          container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
-          markers: not daily and not remote and not gpu and not doctest
-          pytest_command: coverage run -m pytest
         - name: cpu-3.11-2.3
           container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and not doctest
@@ -28,8 +24,12 @@ jobs:
           container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and not doctest
           pytest_command: coverage run -m pytest
+        - name: cpu-3.11-2.5
+          container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
+          markers: not daily and not remote and not gpu and not doctest
+          pytest_command: coverage run -m pytest
         - name: cpu-doctest
-          container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+          container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
     steps:

@@ -1,6 +1,6 @@
 name: PR GPU tests
 on:
-  pull_request_target:
+  pull_request:
   workflow_dispatch:
 # Cancel old runs when a new commit is pushed to the same branch if not on main
 # or dev
@@ -15,8 +15,8 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: gpu-3.11-2.4-1
-          container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+        - name: gpu-3.11-2.5-1
+          container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
           markers: not daily and not remote and gpu and (doctest or not doctest)
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
@@ -44,8 +44,8 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: gpu-3.11-2.4-2
-          container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+        - name: gpu-3.11-2.5-2
+          container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
           markers: not daily and not remote and gpu and (doctest or not doctest)
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml
@@ -74,8 +74,8 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: gpu-3.11-2.4-4
-          container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+        - name: gpu-3.11-2.5-4
+          container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
           markers: not daily and not remote and gpu and (doctest or not doctest)
           pytest_command: coverage run -m pytest
           composer_package_name: mosaicml

diff --git a/composer/checkpoint/state_dict.py b/composer/checkpoint/state_dict.py
@@ -88,7 +88,7 @@ def get_model_state_dict(
             log.debug('Calling model.state_dict() for non-FSDP model...')
             model_state_dict = model.state_dict()
         if isinstance(model, DistributedDataParallel):
-            nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.')
+            nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.')  # type: ignore
 
     if include_keys is not None:
         model_state_dict = _extract_keys_from_state_dict(model_state_dict, include_keys)

@@ -916,7 +916,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
         peft_config.save_pretrained(str(output_folder))
 
     weights_state_dict = composer_state_dict['state']['model']
-    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.')
+    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.')  # type: ignore
 
     # NOTE: This only works for default adapter name, not multiple adapters
     if peft_config is not None:

diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py
@@ -106,7 +106,13 @@ def patch_pytorch():
     elif version.parse(torch.__version__) < version.parse('2.4.1'):
         # Monkey patch for torch < 2.4.1 ie torch == 2.4.0
 
-        # No monkeypatches!
+        # No monkeypatches besides unshard (below)!
+        pass
+
+    elif version.parse(torch.__version__) < version.parse('2.5.1'):
+        # Monkey patch for torch < 2.5.1 ie torch == 2.5.0
+
+        # No monkeypatches besides unshard (below)!
         pass
 
 
@@ -1046,3 +1052,52 @@ def unshard_with_sync(self):
             raise RuntimeError('CUDA out of memory encountered on a different rank')
         padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param)
         self._use_unsharded_flat_param(padded_unsharded_flat_param)
+
+if version.parse(torch.__version__) >= version.parse('2.5.0') and version.parse(
+        torch.__version__,
+) < version.parse('2.5.1'):
+    # Version gate: everything below is defined only when 2.5.0 <= torch.__version__ < 2.5.1.
+    # Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks
+    from torch.distributed.fsdp._flat_param import FlatParamHandle
+    original_unshard = FlatParamHandle.unshard
+
+    @no_type_check
+    def unshard_with_sync(self):
+        """Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`.
+
+        This prevents deadlocks when some ranks OOM after the alloc call and others do not.
+        This is a patched method from pytorch, meant to be called when automicrobatching
+        turns on hooks in its search process for the optimal non-OOMing microbatch size.
+        This includes all-gathering the flat parameter
+        and switching to using the unsharded flat parameter. If the handle does
+        not need unsharding, then this only switches to using the unsharded
+        flat parameter. For ``NO_SHARD``, this is a no-op.
+        If FSDP is in :meth:`summon_full_params` and the handle uses parameter
+        mixed precision, then the parameter is forced to full precision.
+        """
+        if not self.needs_unshard():
+            # Even when not needing an unshard, we should switch to using
+            # the unsharded flat parameter
+            unsharded_flat_param = (
+                self._get_padded_unsharded_flat_param()
+                if self.uses_sharded_strategy
+                else self.flat_param
+            )
+            self._use_unsharded_flat_param(unsharded_flat_param)
+            return
+        unsharded_flat_param = self._alloc_padded_unsharded_flat_param()
+
+        # Check if any other rank hit an OOM: this rank contributes 0 to a MAX all-reduce
+        found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True)
+
+        dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX')
+        found_cuda_oom = found_cuda_oom_tensor.item()
+        # Signal current rank is still in batch: contribute 0 to a MIN all-reduce (result unused here)
+        all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True)
+
+        dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN')
+        # This rank contributed 0, so a reduced MAX of 1 came from another rank; stop before the all-gather
+        if found_cuda_oom == 1:
+            raise RuntimeError('CUDA out of memory encountered on a different rank')
+        padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param)
+        self._use_unsharded_flat_param(padded_unsharded_flat_param)
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
@@ -2317,9 +2317,11 @@ def fit(
             self.state.max_duration = duration + self.state.timestamp.get(duration.unit)
 
         # Raise error if calling fit with SGD
-        if type(
-            self.state.optimizers[0],
-        ) == torch.optim.SGD and version.parse(torch.__version__) >= version.parse('2.4.0'):
+        if (
+            type(self.state.optimizers[0]) == torch.optim.SGD and
+            version.parse(torch.__version__) >= version.parse('2.4.0') and
+            version.parse(torch.__version__) < version.parse('2.5.0')
+        ):
             raise ValueError(
                 'PyTorch 2.4 breaks (distributed) checkpointing with SGD. '
                 'Please use a different optimizer, e.g. composer.optim.DecoupledSGDW, '

@@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`.
 <!-- BEGIN_PYTORCH_BUILD_MATRIX -->
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                                                                                                                                                          |
 |----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.1             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04`                 |
-| Ubuntu 20.04   | Base     | 2.4.1             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` |
-| Ubuntu 20.04   | Base     | 2.4.1             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04`             |
+| Ubuntu 20.04   | Base     | 2.5.0             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04`                 |
+| Ubuntu 20.04   | Base     | 2.5.0             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws` |
+| Ubuntu 20.04   | Base     | 2.5.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04`             |
+| Ubuntu 20.04   | Base     | 2.4.1             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04`                                                                                        |
+| Ubuntu 20.04   | Base     | 2.4.1             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws`                                                                                |
+| Ubuntu 20.04   | Base     | 2.4.1             | cpu                 | 3.11             | `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04`                                                                                            |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                                                                        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                                                                |
 | Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`                                                                                            |
-| Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04`                                                                                        |
-| Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws`                                                                                |
-| Ubuntu 20.04   | Base     | 2.2.2             | cpu                 | 3.11             | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04`                                                                                            |
 <!-- END_PYTORCH_BUILD_MATRIX -->
 
 **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws`