diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index ea1cc8afdbc8c..eab512fc4d5b7 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -39,7 +39,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1" options: "--gpus=all --shm-size=32g" workspace: clean: all @@ -49,18 +49,41 @@ jobs: - bash: | echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver" echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" displayName: 'set env. vars' - bash: | - pip install -e .[strategies] --find-links ${TORCH_URL} + echo $CUDA_VISIBLE_DEVICES + echo $TORCH_URL + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version pip list + displayName: 'Image info & NVIDIA' + + - bash: | + python .actions/assistant.py requirements_prune_pkgs --packages [horovod,bagua,colossalai] --req_files [requirements/pytorch/strategies.txt] + + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION} + displayName: 'Adjust dependencies' + + - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL} env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: "pytorch" + FREEZE_REQUIREMENTS: "1" displayName: 'Install package' + - bash: | + set -e + pip list + python requirements/collect_env_details.py + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" + displayName: 'Env details' + - bash: python -m pytest benchmarks -v --durations=0 env: PL_RUNNING_BENCHMARKS: "1" diff --git a/.azure/gpu-tests-lite.yml b/.azure/gpu-tests-lite.yml index 894afc637aa66..8d774e69df555 100644 --- a/.azure/gpu-tests-lite.yml +++ b/.azure/gpu-tests-lite.yml @@ -41,7 +41,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1" # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' options: "--gpus=all --shm-size=2gb" @@ -51,6 +51,14 @@ jobs: steps: - bash: | + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") + echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" + displayName: 'set env. vars' + + - bash: | + echo $CUDA_VISIBLE_DEVICES + echo $TORCH_URL lspci | egrep 'VGA|3D' whereis nvidia nvidia-smi @@ -61,14 +69,13 @@ jobs: displayName: 'Image info & NVIDIA' - bash: | - echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" - displayName: 'set visible devices' + PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])") + python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION} + python ./requirements/pytorch/adjust-versions.py requirements/lite/examples.txt ${PYTORCH_VERSION} + displayName: 'Adjust dependencies' - bash: | - set -e - CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - pip install -e .[dev,strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html - pip list + pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL} env: PACKAGE_NAME: "lite" FREEZE_REQUIREMENTS: "1" @@ -76,7 +83,7 @@ jobs: - bash: | set -e - echo $CUDA_VISIBLE_DEVICES + pip list python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" displayName: 'Env details' diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 05571269a99a7..53040d0857345 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -42,9 +42,12 @@ jobs: - job: testing strategy: matrix: - # TODO: package parametrization - 'PyTorch - stable': + 'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" + scope: "strategies" + 'PyTorch - latest': + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1" + scope: "" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -93,11 +96,11 @@ jobs: python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION} displayName: 'Adjust dependencies' - - bash: pip install -e .[strategies] -r requirements/pytorch/devel.txt -r requirements/pytorch/examples.txt --find-links ${TORCH_URL} + - bash: pip install -e .[dev,examples] --find-links ${TORCH_URL} env: PACKAGE_NAME: "pytorch" FREEZE_REQUIREMENTS: "1" - displayName: 'Install package' + displayName: 'Install package & extras' - bash: | set -e @@ -109,14 +112,17 @@ jobs: CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org - pip list - displayName: 'Install dependencies' + pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL} + + python requirements/pytorch/check-avail-strategies.py + condition: eq(variables['scope'], 'strategies') + displayName: 'Install strategies' - bash: | set -e + pip list python requirements/collect_env_details.py python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'" - python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py displayName: 'Env details' diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index f7f0c2b1b5fc7..abe9f6ab69192 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -141,12 +141,14 @@ subprojects: - "build-cuda (3.9, 1.10, 11.3.1)" - "build-cuda (3.9, 1.11, 11.3.1)" - "build-cuda (3.9, 1.12, 11.6.1)" + - "build-cuda (3.9, 1.13, 11.6.1)" - "build-hpu (1.5.0, 1.11.0)" - "build-ipu (3.9, 1.9)" - "build-NGC" - "build-pl (3.9, 1.10, 11.3.1)" - "build-pl (3.9, 1.11, 11.3.1)" - "build-pl (3.9, 1.12, 11.6.1)" + # TODO: add 1.13 - "build-xla (3.7, 1.12)" # SECTION: lightning_lite diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 220aa480f1302..c4d9e8dba5b48 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -37,6 +37,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -98,6 +99,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index eccbf608491ae..33102fd3e6705 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -19,6 +19,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} + - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"} steps: - name: Checkout uses: actions/checkout@v3 @@ -47,7 +48,7 @@ jobs: - name: Publish Latest to Docker uses: docker/build-push-action@v3 # Only latest Python and PyTorch - if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' + if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.13' with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 9a2e0455ff40f..3aea1ca0a43b6 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,12 +13,13 @@ # limitations under the License. ARG UBUNTU_VERSION=20.04 -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION=11.6.1 + FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.12 +ARG PYTORCH_VERSION=1.13 SHELL ["/bin/bash", "-c"] # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ @@ -35,7 +36,12 @@ ENV \ RUN \ # TODO: Remove the manual key installation once the base image is updated. # https://github.com/NVIDIA/nvidia-docker/issues/1631 - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + # https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214 + apt-get update && apt-get install -y wget && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ + mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \ + echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" /etc/apt/sources.list.d/cuda.list && \ + apt-get update && \ apt-get update -qq --fix-missing && \ NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ @@ -132,16 +138,20 @@ RUN \ RUN \ # install Bagua - CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ - pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \ - if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ - python -c "import bagua; print(bagua.__version__)" + if [[ $PYTORCH_VERSION != "1.13" ]]; then \ + CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") ; \ + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") ; \ + pip install "bagua-cuda$CUDA_VERSION_BAGUA" ; \ + if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then \ + python -c "import bagua_core; bagua_core.install_deps()"; \ + fi ; \ + python -c "import bagua; print(bagua.__version__)"; \ + fi RUN \ # install ColossalAI - SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \ - if [[ "$SHOULD_INSTALL_COLOSSAL" = "1" ]]; then \ + # TODO: 1.13 wheels are not released, remove skip once they are + if [[ $PYTORCH_VERSION != "1.13" ]]; then \ PYTORCH_VERSION_COLOSSALAI=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])") ; \ CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \ CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \ @@ -152,11 +162,8 @@ RUN \ RUN \ # install rest of strategies # remove colossalai from requirements since they are installed separately - SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \ - if [[ "$SHOULD_INSTALL_COLOSSAL" = "0" ]]; then \ - python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \ - fi && \ - echo "$SHOULD_INSTALL_COLOSSAL" && \ + python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \ + python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" ; \ cat requirements/pytorch/strategies.txt && \ pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html @@ -170,5 +177,4 @@ RUN \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ python requirements/pytorch/check-avail-extras.py && \ - python requirements/pytorch/check-avail-strategies.py && \ rm -rf requirements/ diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 37a75ba9f45bd..2f0cce54f4158 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -5,5 +5,5 @@ colossalai>=0.1.10 fairscale>=0.4.5, <=0.4.6 deepspeed>=0.6.0, <=0.7.0 # no need to install with [pytorch] as pytorch is already installed -horovod>=0.21.2, !=0.24.0, <0.25.1 +horovod>=0.21.2, !=0.24.0, <=0.26.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' diff --git a/tests/tests_lite/conftest.py b/tests/tests_lite/conftest.py index 1455709e5c82f..af023504b5473 100644 --- a/tests/tests_lite/conftest.py +++ b/tests/tests_lite/conftest.py @@ -54,6 +54,7 @@ def restore_env_variables(): "HOROVOD_FUSION_THRESHOLD", "RANK", # set by DeepSpeed "POPLAR_ENGINE_OPTIONS", # set by IPUStrategy + "CUDA_MODULE_LOADING", # leaked since PyTorch 1.13 } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index 5d8616ad00657..2f5607828a232 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -72,6 +72,9 @@ def restore_env_variables(): "HOROVOD_FUSION_THRESHOLD", "RANK", # set by DeepSpeed "POPLAR_ENGINE_OPTIONS", # set by IPUStrategy + "CUDA_MODULE_LOADING", # leaked since PyTorch 1.13 + "KMP_INIT_AT_FORK", # leaked since PyTorch 1.13 + "KMP_DUPLICATE_LIB_OK", # leaked since PyTorch 1.13 } leaked_vars.difference_update(allowlist) assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}" diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index be8bced2cbf5f..97e3d27760ea8 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -101,9 +101,6 @@ def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> Non def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer, FullyShardedDataParallel) assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) - # root should not be resharding - assert self.layer.reshard_after_forward is False - precision = torch.float16 if self.precision == 16 else torch.bfloat16 assert self.layer.mixed_precision.param_dtype == precision assert self.layer.mixed_precision.reduce_dtype == precision @@ -111,9 +108,6 @@ def _assert_layer_fsdp_instance(self) -> None: for layer_num in [0, 2]: assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel) - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer.module[layer_num].reshard_after_forward is True - assert self.layer[layer_num].mixed_precision.param_dtype == precision assert self.layer[layer_num].mixed_precision.reduce_dtype == precision assert self.layer[layer_num].mixed_precision.buffer_dtype == precision @@ -146,9 +140,6 @@ def _assert_layer_fsdp_instance(self) -> None: precision = torch.float16 if self.precision == 16 else torch.bfloat16 for layer_num in [0, 2]: assert isinstance(self.layer[layer_num], FullyShardedDataParallel) - # Assert that the nested layers are set reshard_after_forward to True - assert self.layer[layer_num].reshard_after_forward - assert self.layer[layer_num].mixed_precision.param_dtype == precision assert self.layer[layer_num].mixed_precision.reduce_dtype == precision assert self.layer[layer_num].mixed_precision.buffer_dtype == precision