Skip to content

Commit

Permalink
Upgrade GPU CI to PyTorch 1.13 (#15583)
Browse files Browse the repository at this point in the history
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Jirka <jirka.borovec@seznam.cz>

(cherry picked from commit e87c11a)
  • Loading branch information
awaelchli authored and Borda committed Nov 16, 2022
1 parent 067bb4e commit 91d7a38
Show file tree
Hide file tree
Showing 11 changed files with 89 additions and 47 deletions.
33 changes: 28 additions & 5 deletions .azure/gpu-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
options: "--gpus=all --shm-size=32g"
workspace:
clean: all
Expand All @@ -49,18 +49,41 @@ jobs:
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
displayName: 'set env. vars'
- bash: |
pip install -e .[strategies] --find-links ${TORCH_URL}
echo $CUDA_VISIBLE_DEVICES
echo $TORCH_URL
lspci | egrep 'VGA|3D'
whereis nvidia
nvidia-smi
which python && which pip
python --version
pip --version
pip list
displayName: 'Image info & NVIDIA'
- bash: |
python .actions/assistant.py requirements_prune_pkgs --packages [horovod,bagua,colossalai] --req_files [requirements/pytorch/strategies.txt]
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${PYTORCH_VERSION}
displayName: 'Adjust dependencies'
- bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: pytorch
FREEZE_REQUIREMENTS: 1
PACKAGE_NAME: "pytorch"
FREEZE_REQUIREMENTS: "1"
displayName: 'Install package'

- bash: |
set -e
pip list
python requirements/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
displayName: 'Env details'
- bash: python -m pytest benchmarks -v --durations=0
env:
PL_RUNNING_BENCHMARKS: "1"
Expand Down
23 changes: 15 additions & 8 deletions .azure/gpu-tests-lite.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--gpus=all --shm-size=2gb"
Expand All @@ -51,6 +51,14 @@ jobs:

steps:
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
displayName: 'set env. vars'
- bash: |
echo $CUDA_VISIBLE_DEVICES
echo $TORCH_URL
lspci | egrep 'VGA|3D'
whereis nvidia
nvidia-smi
Expand All @@ -61,22 +69,21 @@ jobs:
displayName: 'Image info & NVIDIA'
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
displayName: 'set visible devices'
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${PYTORCH_VERSION}
python ./requirements/pytorch/adjust-versions.py requirements/lite/examples.txt ${PYTORCH_VERSION}
displayName: 'Adjust dependencies'
- bash: |
set -e
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
pip install -e .[dev,strategies] --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
pip list
pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "lite"
FREEZE_REQUIREMENTS: "1"
displayName: 'Install package & dependencies'
- bash: |
set -e
echo $CUDA_VISIBLE_DEVICES
pip list
python requirements/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
displayName: 'Env details'
Expand Down
20 changes: 13 additions & 7 deletions .azure/gpu-tests-pytorch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,12 @@ jobs:
- job: testing
strategy:
matrix:
# TODO: package parametrization
'PyTorch - stable':
'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
scope: "strategies"
'PyTorch - latest':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.6.1"
scope: ""
# how long to run the job before automatically cancelling
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled tasks' before stopping them
Expand Down Expand Up @@ -93,11 +96,11 @@ jobs:
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${PYTORCH_VERSION}
displayName: 'Adjust dependencies'
- bash: pip install -e .[strategies] -r requirements/pytorch/devel.txt -r requirements/pytorch/examples.txt --find-links ${TORCH_URL}
- bash: pip install -e .[dev,examples] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "pytorch"
FREEZE_REQUIREMENTS: "1"
displayName: 'Install package'
displayName: 'Install package & extras'

- bash: |
set -e
Expand All @@ -109,14 +112,17 @@ jobs:
CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])")
pip install "colossalai==0.1.10+torch${PYTORCH_VERSION_COLOSSALAI}cu${CUDA_VERSION_COLOSSALAI}" --find-links https://release.colossalai.org
pip list
displayName: 'Install dependencies'
pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
python requirements/pytorch/check-avail-strategies.py
condition: eq(variables['scope'], 'strategies')
displayName: 'Install strategies'
- bash: |
set -e
pip list
python requirements/collect_env_details.py
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
python requirements/pytorch/check-avail-strategies.py
python requirements/pytorch/check-avail-extras.py
displayName: 'Env details'
Expand Down
2 changes: 2 additions & 0 deletions .github/checkgroup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,14 @@ subprojects:
- "build-cuda (3.9, 1.10, 11.3.1)"
- "build-cuda (3.9, 1.11, 11.3.1)"
- "build-cuda (3.9, 1.12, 11.6.1)"
- "build-cuda (3.9, 1.13, 11.6.1)"
- "build-hpu (1.5.0, 1.11.0)"
- "build-ipu (3.9, 1.9)"
- "build-NGC"
- "build-pl (3.9, 1.10, 11.3.1)"
- "build-pl (3.9, 1.11, 11.3.1)"
- "build-pl (3.9, 1.12, 11.6.1)"
# TODO: add 1.13
- "build-xla (3.7, 1.12)"

# SECTION: lightning_lite
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci-pytorch-dockers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
- {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
Expand Down Expand Up @@ -98,6 +99,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
- {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/release-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ jobs:
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"}
- {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.6.1"}
steps:
- name: Checkout
uses: actions/checkout@v3
Expand Down Expand Up @@ -47,7 +48,7 @@ jobs:
- name: Publish Latest to Docker
uses: docker/build-push-action@v3
# Only latest Python and PyTorch
if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.13'
with:
repository: pytorchlightning/pytorch_lightning
username: ${{ secrets.DOCKER_USERNAME }}
Expand Down
38 changes: 22 additions & 16 deletions dockers/base-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
# limitations under the License.

ARG UBUNTU_VERSION=20.04
ARG CUDA_VERSION=11.3.1
ARG CUDA_VERSION=11.6.1


FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG PYTHON_VERSION=3.9
ARG PYTORCH_VERSION=1.12
ARG PYTORCH_VERSION=1.13

SHELL ["/bin/bash", "-c"]
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
Expand All @@ -35,7 +36,12 @@ ENV \
RUN \
# TODO: Remove the manual key installation once the base image is updated.
# https://github.com/NVIDIA/nvidia-docker/issues/1631
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
# https://github.com/NVIDIA/nvidia-docker/issues/1631#issuecomment-1264715214
apt-get update && apt-get install -y wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
mkdir -p /etc/apt/keyrings/ && mv 3bf863cc.pub /etc/apt/keyrings/ && \
echo "deb [signed-by=/etc/apt/keyrings/3bf863cc.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" /etc/apt/sources.list.d/cuda.list && \
apt-get update && \
apt-get update -qq --fix-missing && \
NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \
CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \
Expand Down Expand Up @@ -132,16 +138,20 @@ RUN \

RUN \
# install Bagua
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \
pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \
if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \
python -c "import bagua; print(bagua.__version__)"
if [[ $PYTORCH_VERSION != "1.13" ]]; then \
CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") ; \
CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") ; \
pip install "bagua-cuda$CUDA_VERSION_BAGUA" ; \
if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then \
python -c "import bagua_core; bagua_core.install_deps()"; \
fi ; \
python -c "import bagua; print(bagua.__version__)"; \
fi

RUN \
# install ColossalAI
SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \
if [[ "$SHOULD_INSTALL_COLOSSAL" = "1" ]]; then \
# TODO: 1.13 wheels are not released, remove skip once they are
if [[ $PYTORCH_VERSION != "1.13" ]]; then \
PYTORCH_VERSION_COLOSSALAI=$(python -c "import torch; print(torch.__version__.split('+')[0][:4])") ; \
CUDA_VERSION_MM_COLOSSALAI=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda)))") ; \
CUDA_VERSION_COLOSSALAI=$(python -c "print([ver for ver in [11.3, 11.1] if $CUDA_VERSION_MM_COLOSSALAI >= ver][0])") ; \
Expand All @@ -152,11 +162,8 @@ RUN \
RUN \
# install rest of strategies
# remove colossalai from requirements since they are installed separately
SHOULD_INSTALL_COLOSSAL=$(python -c "import torch; print(1 if int(torch.__version__.split('.')[1]) > 9 else 0)") && \
if [[ "$SHOULD_INSTALL_COLOSSAL" = "0" ]]; then \
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
fi && \
echo "$SHOULD_INSTALL_COLOSSAL" && \
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" ; \
cat requirements/pytorch/strategies.txt && \
pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html

Expand All @@ -170,5 +177,4 @@ RUN \
python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
python requirements/pytorch/check-avail-extras.py && \
python requirements/pytorch/check-avail-strategies.py && \
rm -rf requirements/
2 changes: 1 addition & 1 deletion requirements/pytorch/strategies.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ colossalai>=0.1.10
fairscale>=0.4.5, <=0.4.6
deepspeed>=0.6.0, <=0.7.0
# no need to install with [pytorch] as pytorch is already installed
horovod>=0.21.2, !=0.24.0, <0.25.1
horovod>=0.21.2, !=0.24.0, <=0.26.1
hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'
1 change: 1 addition & 0 deletions tests/tests_lite/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def restore_env_variables():
"HOROVOD_FUSION_THRESHOLD",
"RANK", # set by DeepSpeed
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
"CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
}
leaked_vars.difference_update(allowlist)
assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"
Expand Down
3 changes: 3 additions & 0 deletions tests/tests_pytorch/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ def restore_env_variables():
"HOROVOD_FUSION_THRESHOLD",
"RANK", # set by DeepSpeed
"POPLAR_ENGINE_OPTIONS", # set by IPUStrategy
"CUDA_MODULE_LOADING", # leaked since PyTorch 1.13
"KMP_INIT_AT_FORK", # leaked since PyTorch 1.13
"KMP_DUPLICATE_LIB_OK", # leaked since PyTorch 1.13
}
leaked_vars.difference_update(allowlist)
assert not leaked_vars, f"test is leaking environment variable(s): {set(leaked_vars)}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,19 +101,13 @@ def on_predict_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> Non
def _assert_layer_fsdp_instance(self) -> None:
assert isinstance(self.layer, FullyShardedDataParallel)
assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin)
# root should not be resharding
assert self.layer.reshard_after_forward is False

precision = torch.float16 if self.precision == 16 else torch.bfloat16
assert self.layer.mixed_precision.param_dtype == precision
assert self.layer.mixed_precision.reduce_dtype == precision
assert self.layer.mixed_precision.buffer_dtype == precision

for layer_num in [0, 2]:
assert isinstance(self.layer.module[layer_num], FullyShardedDataParallel)
# Assert that the nested layers are set reshard_after_forward to True
assert self.layer.module[layer_num].reshard_after_forward is True

assert self.layer[layer_num].mixed_precision.param_dtype == precision
assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
assert self.layer[layer_num].mixed_precision.buffer_dtype == precision
Expand Down Expand Up @@ -146,9 +140,6 @@ def _assert_layer_fsdp_instance(self) -> None:
precision = torch.float16 if self.precision == 16 else torch.bfloat16
for layer_num in [0, 2]:
assert isinstance(self.layer[layer_num], FullyShardedDataParallel)
# Assert that the nested layers are set reshard_after_forward to True
assert self.layer[layer_num].reshard_after_forward

assert self.layer[layer_num].mixed_precision.param_dtype == precision
assert self.layer[layer_num].mixed_precision.reduce_dtype == precision
assert self.layer[layer_num].mixed_precision.buffer_dtype == precision
Expand Down

0 comments on commit 91d7a38

Please sign in to comment.