Upgrade GPU CI to PyTorch 1.13 (#15583)

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Jirka <jirka.borovec@seznam.cz>
(cherry picked from commit e87c11a)
Lightning-AI · Nov 16, 2022 · 91d7a38
1 parent 067bb4e
commit 91d7a38
Show file tree

Hide file tree

Showing 11 changed files with 89 additions and 47 deletions.
diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml
@@ -39,7 +39,7 @@ jobs:
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
       options: "--gpus=all --shm-size=32g"
     workspace:
       clean: all
@@ -49,18 +49,41 @@ jobs:
     - bash: |
         echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
         cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-        echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
         echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
       displayName: 'set env. vars'
 
     - bash: |
-        pip install -e .[strategies] --find-links ${TORCH_URL}
+        echo $CUDA_VISIBLE_DEVICES
+        echo $TORCH_URL
+        lspci | egrep 'VGA|3D'
+        whereis nvidia
+        nvidia-smi
+        which python && which pip
+        python --version
+        pip --version
         pip list
+      displayName: 'Image info & NVIDIA'
+
+    - bash: |
+        # list arguments are quoted so bash passes the brackets literally instead of glob-matching them
+        python .actions/assistant.py requirements_prune_pkgs --packages "[horovod,bagua,colossalai]" --req_files "[requirements/pytorch/strategies.txt]"
+        PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
+        python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
+      displayName: 'Adjust dependencies'
+
+    - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
       env:
-        PACKAGE_NAME: pytorch
-        FREEZE_REQUIREMENTS: 1
+        PACKAGE_NAME: "pytorch"
+        FREEZE_REQUIREMENTS: "1"
       displayName: 'Install package'
 
+    - bash: |
+        set -e
+        pip list
+        python requirements/collect_env_details.py
+        python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
+      displayName: 'Env details'
+
     - bash: python -m pytest benchmarks -v --durations=0
       env:
         PL_RUNNING_BENCHMARKS: "1"

diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml
@@ -41,7 +41,7 @@ jobs:
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--gpus=all --shm-size=2gb"
@@ -51,6 +51,14 @@ jobs:
 
     steps:
     - bash: |
+        echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
+        cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
+        echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
+      displayName: 'set env. vars'
+
+    - bash: |
+        echo $CUDA_VISIBLE_DEVICES
+        echo $TORCH_URL
         lspci | egrep 'VGA|3D'
         whereis nvidia
         nvidia-smi
@@ -61,22 +69,21 @@ jobs:
       displayName: 'Image info & NVIDIA'
 
     - bash: |
-        echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
-      displayName: 'set visible devices'
+        PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
+        python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION}
+        python ./requirements/pytorch/adjust-versions.py requirements/lite/examples.txt ${PYTORCH_VERSION}
+      displayName: 'Adjust dependencies'
 
     - bash: |
-        set -e
-        CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-        pip install -e .[dev,strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
-        pip list
+        pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
       env:
         PACKAGE_NAME: "lite"
         FREEZE_REQUIREMENTS: "1"
       displayName: 'Install package & dependencies'
 
     - bash: |
         set -e
-        echo $CUDA_VISIBLE_DEVICES
+        pip list
         python requirements/collect_env_details.py
         python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
       displayName: 'Env details'

diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
@@ -42,9 +42,12 @@ jobs:
   - job: testing
     strategy:
       matrix:
-        # TODO: package parametrization
-        'PyTorch - stable':
+        'PyTorch & strategies':  # this uses torch 1.12 as not all strategies support 1.13 yet
           image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
+          scope: "strategies"
+        'PyTorch - latest':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
+          scope: ""
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "80"
     # how much time to give 'run always even if cancelled tasks' before stopping them
@@ -93,11 +96,11 @@ jobs:
         python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
       displayName: 'Adjust dependencies'
 
-    - bash: pip install -e .[strategies] -r requirements/pytorch/devel.txt -r requirements/pytorch/examples.txt --find-links ${TORCH_URL}
+    - bash: pip install -e .[dev,examples] --find-links ${TORCH_URL}
       env:
         PACKAGE_NAME: "pytorch"
         FREEZE_REQUIREMENTS: "1"
-      displayName: 'Install package'
+      displayName: 'Install package & extras'
 
     - bash: |
         set -e
@@ -109,14 +112,17 @@ jobs:
         CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])")
         pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org
 
-        pip list
-      displayName: 'Install dependencies'
+        pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
+
+        python requirements/pytorch/check-avail-strategies.py
+      condition: eq(variables['scope'], 'strategies')
+      displayName: 'Install strategies'
 
     - bash: |
         set -e
+        pip list
         python requirements/collect_env_details.py
         python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
-        python requirements/pytorch/check-avail-strategies.py
         python requirements/pytorch/check-avail-extras.py
       displayName: 'Env details'
 

diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
@@ -141,12 +141,14 @@ subprojects:
       - "build-cuda (3.9, 1.10, 11.3.1)"
       - "build-cuda (3.9, 1.11, 11.3.1)"
       - "build-cuda (3.9, 1.12, 11.6.1)"
+      - "build-cuda (3.9, 1.13, 11.6.1)"
       - "build-hpu (1.5.0, 1.11.0)"
       - "build-ipu (3.9, 1.9)"
       - "build-NGC"
       - "build-pl (3.9, 1.10, 11.3.1)"
       - "build-pl (3.9, 1.11, 11.3.1)"
       - "build-pl (3.9, 1.12, 11.6.1)"
+      # TODO: add "build-pl (3.9, 1.13, 11.6.1)"
       - "build-xla (3.7, 1.12)"
 
   # SECTION: lightning_lite

diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml
@@ -37,6 +37,7 @@ jobs:
           - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
+          - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
     steps:
       - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
@@ -98,6 +99,7 @@ jobs:
           - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
+          - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
     steps:
       - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2

diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
@@ -19,6 +19,7 @@ jobs:
           - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
           - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
+          - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -47,7 +48,7 @@ jobs:
       - name: Publish Latest to Docker
         uses: docker/build-push-action@v3
         # Only latest Python and PyTorch
-        if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
+        if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.13'
         with:
           repository: pytorchlightning/pytorch_lightning
           username: ${{ secrets.DOCKER_USERNAME }}

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
@@ -13,12 +13,13 @@
 # limitations under the License.
 
 ARG UBUNTU_VERSION=20.04
-ARG CUDA_VERSION=11.3.1
+ARG CUDA_VERSION=11.6.1
+
 
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 
 ARG PYTHON_VERSION=3.9
-ARG PYTORCH_VERSION=1.12
+ARG PYTORCH_VERSION=1.13
 
 SHELL ["/bin/bash", "-c"]
 # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
@@ -35,7 +36,12 @@ ENV \
 RUN \
     # TODO: Remove the manual key installation once the base image is updated.
     # https://github.com/NVIDIA/nvidia-docker/issues/1631
-    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+    # https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214
+    apt-get update && apt-get install -y wget && \
+    mkdir -p /etc/apt/keyrings/ && wget -O /etc/apt/keyrings/3bf863cc.asc https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
+    # write (not just print) the source entry for the ubuntu2004 repo; the ASCII-armored key is stored as .asc so apt accepts it for signed-by
+    echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.asc] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" > /etc/apt/sources.list.d/cuda.list && \
+    apt-get update && \
     apt-get update -qq --fix-missing && \
     NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
     CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
@@ -132,16 +138,20 @@ RUN \
 
 RUN \
     # install Bagua
-    CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
-    CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
-    pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \
-    if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
-    python -c "import bagua; print(bagua.__version__)"
+    if [[ $PYTORCH_VERSION != "1.13" ]]; then \
+        CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") ; \
+        CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") ; \
+        pip install "bagua-cuda$CUDA_VERSION_BAGUA" ; \
+        if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then \
+          python -c "import bagua_core; bagua_core.install_deps()"; \
+        fi ; \
+        python -c "import bagua; print(bagua.__version__)"; \
+    fi
 
 RUN \
     # install ColossalAI
-    SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \
-    if [[ "$SHOULD_INSTALL_COLOSSAL" = "1" ]]; then \
+    # TODO: 1.13 wheels are not released, remove skip once they are
+    if [[ $PYTORCH_VERSION != "1.13" ]]; then \
         PYTORCH_VERSION_COLOSSALAI=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])") ; \
         CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
         CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
@@ -152,11 +162,8 @@ RUN \
 RUN \
     # install rest of strategies
     # remove colossalai from requirements since they are installed separately
-    SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \
-    if [[ "$SHOULD_INSTALL_COLOSSAL" = "0" ]]; then \
-        python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
-    fi && \
-    echo "$SHOULD_INSTALL_COLOSSAL" && \
+    # horovod is pruned together with colossalai; `&&` makes a failed prune abort the build instead of being ignored
+    python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line and 'horovod' not in line] ; open(fname, 'w').writelines(lines)" && \
     cat requirements/pytorch/strategies.txt && \
     pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
 
@@ -170,5 +177,4 @@ RUN \
     python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
     python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
     python requirements/pytorch/check-avail-extras.py && \
-    python requirements/pytorch/check-avail-strategies.py && \
     rm -rf requirements/
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
@@ -5,5 +5,5 @@ colossalai>=0.1.10
 fairscale>=0.4.5, <=0.4.6
 deepspeed>=0.6.0, <=0.7.0
 # no need to install with [pytorch] as pytorch is already installed
-horovod>=0.21.2, !=0.24.0, <0.25.1
+horovod>=0.21.2, !=0.24.0, <=0.26.1
 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'
diff --git a/tests/tests_lite/conftest.py b/tests/tests_lite/conftest.py
@@ -54,6 +54,7 @@ def restore_env_variables():
         "HOROVOD_FUSION_THRESHOLD",
         "RANK",  # set by DeepSpeed
         "POPLAR_ENGINE_OPTIONS",  # set by IPUStrategy
+        "CUDA_MODULE_LOADING",  # leaked since PyTorch 1.13
     }
     leaked_vars.difference_update(allowlist)
     assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"

diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py
@@ -72,6 +72,9 @@ def restore_env_variables():
         "HOROVOD_FUSION_THRESHOLD",
         "RANK",  # set by DeepSpeed
         "POPLAR_ENGINE_OPTIONS",  # set by IPUStrategy
+        "CUDA_MODULE_LOADING",  # leaked since PyTorch 1.13
+        "KMP_INIT_AT_FORK",  # leaked since PyTorch 1.13
+        "KMP_DUPLICATE_LIB_OK",  # leaked since PyTorch 1.13
     }
     leaked_vars.difference_update(allowlist)
     assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"

diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py
@@ -101,19 +101,13 @@ def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> Non
     def _assert_layer_fsdp_instance(self) -> None:
         assert isinstance(self.layer, FullyShardedDataParallel)
         assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin)
-        # root should not be resharding
-        assert self.layer.reshard_after_forward is False
-
         precision = torch.float16 if self.precision == 16 else torch.bfloat16
         assert self.layer.mixed_precision.param_dtype == precision
         assert self.layer.mixed_precision.reduce_dtype == precision
         assert self.layer.mixed_precision.buffer_dtype == precision
 
         for layer_num in [0, 2]:
             assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel)
-            # Assert that the nested layers are set reshard_after_forward to True
-            assert self.layer.module[layer_num].reshard_after_forward is True
-
             assert self.layer[layer_num].mixed_precision.param_dtype == precision
             assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
             assert self.layer[layer_num].mixed_precision.buffer_dtype == precision
@@ -146,9 +140,6 @@ def _assert_layer_fsdp_instance(self) -> None:
         precision = torch.float16 if self.precision == 16 else torch.bfloat16
         for layer_num in [0, 2]:
             assert isinstance(self.layer[layer_num], FullyShardedDataParallel)
-            # Assert that the nested layers are set reshard_after_forward to True
-            assert self.layer[layer_num].reshard_after_forward
-
             assert self.layer[layer_num].mixed_precision.param_dtype == precision
             assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
             assert self.layer[layer_num].mixed_precision.buffer_dtype == precision