diff --git a/.codecov.yml b/.codecov.yml
index 6abf8b1a16a8d0..3bcfe7fb9f624e 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -9,7 +9,7 @@ codecov:
strict_yaml_branch: "yaml-config"
require_ci_to_pass: yes
notify:
- after_n_builds: 21
+ after_n_builds: 22
wait_for_ci: yes
# https://docs.codecov.io/docs/codecov-yaml#section-expired-reports
max_report_age: off
@@ -50,4 +50,4 @@ comment:
layout: header, diff
require_changes: false
behavior: default # update if exists else create new
- after_n_builds: 21
+ after_n_builds: 22
diff --git a/.drone.yml b/.drone.yml
index 71532d96ed5a48..edb6f48bbb0e3d 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -12,6 +12,7 @@ steps:
SLURM_LOCALID: 0
CODECOV_TOKEN:
from_secret: codecov_token
+ MKL_THREADING_LAYER: GNU
HOROVOD_GPU_ALLREDUCE: NCCL
HOROVOD_GPU_BROADCAST: NCCL
HOROVOD_WITH_PYTORCH: 1
@@ -33,10 +34,10 @@ steps:
- nvidia-smi
#- bash ./tests/install_AMP.sh
- apt-get update && apt-get install -y cmake
- - pip install -r ./requirements/base.txt --user -q
- - pip install -r ./requirements/devel.txt --user -q
+ - pip install -r ./requirements/base.txt --user -q --upgrade-strategy only-if-needed
+ - pip install -r ./requirements/devel.txt --user -q --upgrade-strategy only-if-needed
#- pip install -r ./requirements/docs.txt --user -q
- - pip install -r ./requirements/examples.txt --user -q
+ - pip install -r ./requirements/examples.txt --user -q --upgrade-strategy only-if-needed
- pip list
- python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')"
- coverage run --source pytorch_lightning -m py.test pytorch_lightning tests -v --durations=25 # --flake8
diff --git a/.github/workflows/ci-test-base.yml b/.github/workflows/ci-test-base.yml
index e1f64150544806..855a9831fd878c 100644
--- a/.github/workflows/ci-test-base.yml
+++ b/.github/workflows/ci-test-base.yml
@@ -47,7 +47,7 @@ jobs:
python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)"
- name: Cache pip
- uses: actions/cache@v1
+ uses: actions/cache@v2
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/base.txt') }}
@@ -57,7 +57,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade --user pip
- pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade-strategy only-if-needed
+ pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
pip install --requirement ./requirements/test.txt --quiet --upgrade-strategy only-if-needed
# pip install tox coverage
python --version
@@ -66,7 +66,7 @@ jobs:
shell: bash
- name: Cache datasets
- uses: actions/cache@v1
+ uses: actions/cache@v2
with:
path: Datasets # This path is specific to Ubuntu
# Look to see if there is a cache hit for the corresponding requirements file
diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index a3b9f42ef952af..0ed2db475546f2 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -30,10 +30,6 @@ jobs:
# TODO: temporary fix till hanging jobs on macOS for py38 is resolved
- python-version: 3.8
os: macOS-10.15
- # TODO: temporary fix till pyYaml can be installed, see: https://github.com/actions/setup-python/issues/114
- - python-version: 3.7
- os: ubuntu-18.04
- requires: 'minimal'
# Timeout: https://stackoverflow.com/a/59076067/4521646
timeout-minutes: 25
@@ -44,6 +40,10 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
+ - name: Update Pip
+ run: |
+ pip install --quiet "pip>=20.1" --upgrade --user # needed for get pip cacher folder
+
# Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646
- name: Setup macOS
if: runner.os == 'macOS'
@@ -54,14 +54,9 @@ jobs:
- name: Setup Windows
if: runner.os == 'windows'
run: |
+ # remove Horovod from requirements
python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)"
- # TODO: remove after https://github.com/pytorch/pytorch/issues/32186 is resolved
- - name: Setup Windows on Latest
- if: runner.os == 'windows' && matrix.requires == 'latest'
- run: |
- python -c "fname = 'requirements/base.txt' ; req = open(fname).read().replace('torch>=1.3', 'torch<1.5') ; open(fname, 'w').write(req)"
-
# versions <= 1.3 may have issues on mac with some BLAS ops due to missing mkl (https://github.com/pytorch/pytorch/issues/18996)
- name: Setup MacOS Minimal
if: runner.os == 'macOS' && matrix.requires == 'minimal'
@@ -77,23 +72,23 @@ jobs:
# Note: This uses an internal pip API and may not always work
# https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
- - name: Get pip cache
+ - name: Get pip cache dir
id: pip-cache
run: |
- python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)"
+ echo "::set-output name=dir::$(pip cache dir)"
- - name: Cache pip
- uses: actions/cache@v1
+ - name: pip cache
+ uses: actions/cache@v2
with:
path: ${{ steps.pip-cache.outputs.dir }}
- key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/extra.txt') }}
+ key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/extra.txt') }}
restore-keys: |
- ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ matrix.requires }}-pip-
+ ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-pip-
- name: Install dependencies
run: |
# python -m pip install --upgrade --user pip
- pip install --requirement requirements/base.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade-strategy only-if-needed
+ pip install --requirement requirements/base.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade
HOROVOD_BUILD_ARCH_FLAGS="-mfma" pip install --requirement ./requirements/devel.txt --quiet --upgrade-strategy "only-if-needed"
python --version
pip --version
@@ -112,7 +107,7 @@ jobs:
shell: bash
- name: Cache datasets
- uses: actions/cache@v1
+ uses: actions/cache@v2
with:
path: Datasets # This path is specific to Ubuntu
# Look to see if there is a cache hit for the corresponding requirements file
diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml
index 9938be4d9ff866..8ae2f6829bb48e 100644
--- a/.github/workflows/code-formatting.yml
+++ b/.github/workflows/code-formatting.yml
@@ -42,12 +42,12 @@ jobs:
# Note: This uses an internal pip API and may not always work
# https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
- name: Cache pip
- uses: actions/cache@v1
+ uses: actions/cache@v2
with:
path: ~/.cache/pip
- key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/extra.txt') }}
+ key: ${{ runner.os }}-pip-extras-${{ hashFiles('requirements/base.txt') }}-${{ hashFiles('requirements/extra.txt') }}
restore-keys: |
- ${{ runner.os }}-pip-
+ ${{ runner.os }}-pip-extras-
- name: Install dependencies
run: |
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 714d15f191a373..5cb1aec47abc4b 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -15,7 +15,7 @@ jobs:
strategy:
matrix:
python_version: [3.6, 3.7, 3.8]
- pytorch_version: [1.3, 1.4, 1.5]
+ pytorch_version: [1.3, 1.4, 1.5, 1.6]
exclude:
# excludes PT 1.3 as it is missing on pypi
- python_version: 3.8
@@ -82,3 +82,29 @@ jobs:
build_args: PYTHON_VERSION=${{ matrix.python_version }}
tags: "XLA-extras-py${{ matrix.python_version }}"
timeout-minutes: 25
+
+ build-cuda:
+ runs-on: ubuntu-20.04
+ strategy:
+ matrix:
+ python_version: [3.7]
+ pytorch_version: [1.3, 1.4, 1.5, 1.6.0]
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v2
+ - uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+
+ - name: Publish Master to Docker
+ # publish master
+ uses: docker/build-push-action@v1.1.0
+ if: github.event_name == 'push'
+ with:
+ repository: pytorchlightning/pytorch_lightning
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+ dockerfile: dockers/cuda-extras/Dockerfile
+ build_args: PYTHON_VERSION=${{ matrix.python_version }},TORCH_VERSION=${{ matrix.pytorch_version }}
+ tags: "cuda-extras-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
+ timeout-minutes: 40
diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml
index c4d4893cd668cd..b9442216052996 100644
--- a/.github/workflows/docs-checks.yml
+++ b/.github/workflows/docs-checks.yml
@@ -29,7 +29,7 @@ jobs:
# Note: This uses an internal pip API and may not always work
# https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
- name: Cache pip
- uses: actions/cache@v1
+ uses: actions/cache@v2
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }}
@@ -67,7 +67,7 @@ jobs:
# Note: This uses an internal pip API and may not always work
# https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow
- name: Cache pip
- uses: actions/cache@v1
+ uses: actions/cache@v2
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('requirements/base.txt') }}
diff --git a/.github/workflows/pt-conda.yml b/.github/workflows/pt-conda.yml
index 98c9a038ed23fa..a5e41530d23e5c 100644
--- a/.github/workflows/pt-conda.yml
+++ b/.github/workflows/pt-conda.yml
@@ -23,10 +23,10 @@ jobs:
os: [ubuntu-20.04]
python-version: [3.7]
# todo: add nightly versions
- pytorch-version: [1.3, 1.4, 1.5] # , 1.6, 1.7
+ pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7
# Timeout: https://stackoverflow.com/a/59076067/4521646
- timeout-minutes: 20
+ timeout-minutes: 35
steps:
- uses: actions/checkout@v2
@@ -38,12 +38,21 @@ jobs:
# TODO: set source for nightly
- name: Cache conda
- uses: actions/cache@v1
- env: # Increase this value to reset cache if etc/example-environment.yml has not changed
- CACHE_NUMBER: 0
+ uses: actions/cache@v2
with:
path: ~/conda_pkgs_dir
- key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('environment.yml') }}
+ key: ${{ runner.os }}-conda-py${{ matrix.python-version }}-pt${{ matrix.pytorch-version }}-${{ hashFiles('environment.yml') }}
+ restore-keys: |
+ ${{ runner.os }}-conda-py${{ matrix.python-version }}-pt${{ matrix.pytorch-version }}-
+
+ # Add another cache for pip as not all packages live in the Conda env
+ - name: Cache pip
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/pip
+ key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-pt${{ matrix.pytorch-version }}-${{ hashFiles('requirements/base.txt') }}
+ restore-keys: |
+ ${{ runner.os }}-pip-py${{ matrix.python-version }}-pt${{ matrix.pytorch-version }}-
# https://docs.conda.io/projects/conda/en/4.6.0/_downloads/52a95608c49671267e40c689e0bc00ca/conda-cheatsheet.pdf
# https://gist.github.com/mwouts/9842452d020c08faf9e84a3bba38a66f
@@ -52,7 +61,8 @@ jobs:
with:
# auto-update-conda: true
auto-activate-base: false
- miniconda-version: 4.7.12
+ # miniconda-version: 4.7.12 # this would download a new conda; use conda-version instead
+ conda-version: 4.7.12
python-version: ${{ matrix.python-version }}
environment-file: environment.yml
activate-environment: pl-env
@@ -70,7 +80,7 @@ jobs:
shell: bash -l {0}
- name: Cache datasets
- uses: actions/cache@v1
+ uses: actions/cache@v2
with:
path: Datasets # This path is specific to Ubuntu
# Look to see if there is a cache hit for the corresponding requirements file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f1382f621d2eba..c44db54cac5a1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,8 +12,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added SSIM metrics ([#2671](https://github.com/PyTorchLightning/pytorch-lightning/pull/2671))
- Added BLEU metrics ([#2535](https://github.com/PyTorchLightning/pytorch-lightning/pull/2535))
+- Added support to export a model to ONNX format ([#2596](https://github.com/PyTorchLightning/pytorch-lightning/pull/2596))
+
- Added support for `Trainer(num_sanity_val_steps=-1)` to check all validation data before training ([#2246](https://github.com/PyTorchLightning/pytorch-lightning/pull/2246))
+- Added support for PyTorch 1.6 ([#2745](https://github.com/PyTorchLightning/pytorch-lightning/pull/2745))
+
### Changed
- Truncated long version numbers in progress bar ([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594))
@@ -43,6 +47,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed test metrics not being logged with `LoggerCollection` ([#2723](https://github.com/PyTorchLightning/pytorch-lightning/pull/2723))
+- Fixed data transfer to device when using `torchtext.data.Field` and `include_lengths is True` ([#2689](https://github.com/PyTorchLightning/pytorch-lightning/pull/2689))
+
## [0.8.5] - 2020-07-09
### Added
diff --git a/README.md b/README.md
index 8e4de149192860..2b2132799965dd 100644
--- a/README.md
+++ b/README.md
@@ -38,14 +38,14 @@
## Continuous Integration
-| System / PyTorch ver. | 1.3 (min. req.) | 1.4 | 1.5 (latest) |
-| :---: | :---: | :---: | :---: |
-| Conda py3.7 [linux] | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) |
-| Linux py3.7 [GPU] | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
-| Linux py3.7 [TPU] | - | - | ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg) |
-| Linux py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
-| OSX py3.6 / py3.7 | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
-| Windows py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |[![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
+| System / PyTorch ver. | 1.3 (min. req.) [w/o py3.8] | 1.4 | 1.5 | 1.6 (latest) |
+| :---: | :---: | :---: | :---: | :---: |
+| Conda py3.7 [linux] | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) |
+| Linux py3.7 [GPU] | - | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
+| Linux py3.7 [TPU] | - | - | - | ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg) |
+| Linux py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
+| OSX py3.6 / py3.7 | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
+| Windows py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
@@ -437,6 +437,8 @@ You can also install any past release `0.X.Y` from this repository:
pip install https://github.com/PytorchLightning/pytorch-lightning/archive/0.X.Y.zip --upgrade
```
+---
+
## Lightning team
#### Leads
diff --git a/dockers/README.md b/dockers/README.md
index b03c3d7a578910..7b3063e00f79c4 100644
--- a/dockers/README.md
+++ b/dockers/README.md
@@ -12,7 +12,7 @@ or with specific arguments
```bash
git clone
docker image build \
- -t pytorch-lightning:py38 \
+ -t pytorch-lightning:py3.8 \
-f dockers/conda/Dockerfile \
--build-arg PYTHON_VERSION=3.8 \
--build-arg PYTORCH_VERSION=1.4 \
diff --git a/dockers/cuda-extras/Dockerfile b/dockers/cuda-extras/Dockerfile
new file mode 100644
index 00000000000000..c4bc5cfb641fd9
--- /dev/null
+++ b/dockers/cuda-extras/Dockerfile
@@ -0,0 +1,40 @@
+# Existing images:
+# --build-arg TORCH_VERSION=1.6.0 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1
+# --build-arg TORCH_VERSION=1.2 --build-arg CUDA_VERSION=10.0
+# --build-arg TORCH_VERSION=1.1.0 --build-arg CUDA_VERSION=10.0 --build-arg CUDNN_VERSION=7.5
+
+ARG TORCH_VERSION=1.6.0
+ARG CUDA_VERSION=10.1
+ARG CUDNN_VERSION=7
+
+FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
+
+ENV HOROVOD_GPU_ALLREDUCE=NCCL
+ENV HOROVOD_GPU_BROADCAST=NCCL
+ENV HOROVOD_WITH_PYTORCH=1
+ENV HOROVOD_WITHOUT_TENSORFLOW=1
+ENV HOROVOD_WITHOUT_MXNET=1
+ENV HOROVOD_WITH_GLOO=1
+ENV HOROVOD_WITHOUT_MPI=1
+ENV PATH="$PATH:/root/.local/bin"
+ENV MAKEFLAGS="-j$(nproc)"
+
+COPY ./tests/install_AMP.sh install_AMP.sh
+COPY ./requirements/base.txt requirements.txt
+COPY ./requirements/extra.txt requirements-extra.txt
+COPY ./requirements/test.txt requirements-tests.txt
+COPY ./requirements/examples.txt requirements-examples.txt
+
+RUN apt-get update && apt-get install -y cmake && \
+ # Install AMP
+ bash install_AMP.sh && \
+ pip install -r requirements.txt && \
+ # HOROVOD_BUILD_ARCH_FLAGS="-mfma" && \
+ pip install -r requirements-extra.txt && \
+ pip install -r requirements-examples.txt && \
+ pip install -r requirements-tests.txt && \
+ rm requirements* && \
+ pip list
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4b1b7c697a6c80..3637892848b513 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -99,6 +99,7 @@ PyTorch Lightning Documentation
transfer_learning
tpu
test_set
+ production_inference
.. toctree::
:maxdepth: 1
diff --git a/docs/source/production_inference.rst b/docs/source/production_inference.rst
new file mode 100644
index 00000000000000..3159abe630b686
--- /dev/null
+++ b/docs/source/production_inference.rst
@@ -0,0 +1,28 @@
+Inference in Production
+=======================
+PyTorch Lightning eases the process of deploying models into production.
+
+
+Exporting to ONNX
+-----------------
+PyTorch Lightning provides a handy function to quickly export your model to the ONNX format, which makes the model independent of PyTorch so it can be served with ONNX Runtime.
+
+To export your model to ONNX, call the ``to_onnx`` method on your ``LightningModule`` with a file path and an input sample.
+
+.. code-block:: python
+
+ filepath = 'model.onnx'
+ model = SimpleModel()
+ input_sample = torch.randn((1, 64))
+ model.to_onnx(filepath, input_sample, export_params=True)
+
+You can also skip passing the input sample if the ``example_input_array`` attribute is set on your ``LightningModule``, as in the sketch below.
+
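+A minimal sketch, assuming the toy ``SimpleModel`` used above is defined as follows; ``example_input_array`` is the attribute ``to_onnx`` falls back to when ``input_sample`` is omitted:
+
+.. code-block:: python
+
+    import torch
+    from pytorch_lightning import LightningModule
+
+    class SimpleModel(LightningModule):
+        def __init__(self):
+            super().__init__()
+            self.l1 = torch.nn.Linear(in_features=64, out_features=4)
+            # to_onnx picks this up when no input_sample is passed
+            self.example_input_array = torch.randn((1, 64))
+
+        def forward(self, x):
+            return torch.relu(self.l1(x.view(x.size(0), -1)))
+
+    model = SimpleModel()
+    model.to_onnx('model.onnx', export_params=True)  # no input_sample needed
+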
+Once the model is exported, you can run it with ONNX Runtime as follows:
+
+.. code-block:: python
+
+    import numpy as np
+    import onnxruntime
+
+    ort_session = onnxruntime.InferenceSession(filepath)
+    input_name = ort_session.get_inputs()[0].name
+    ort_inputs = {input_name: np.random.randn(1, 64).astype(np.float32)}
+    ort_outs = ort_session.run(None, ort_inputs)
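+
+To sanity-check the export, you can compare the ONNX Runtime output with the PyTorch output on the same input. A minimal sketch, reusing ``model``, ``input_sample``, ``ort_session`` and ``input_name`` from the snippets above:
+
+.. code-block:: python
+
+    model.eval()
+    with torch.no_grad():
+        torch_out = model(input_sample)
+
+    ort_outs = ort_session.run(None, {input_name: input_sample.numpy()})
+    # both runtimes should agree up to small numerical differences
+    assert np.allclose(torch_out.numpy(), ort_outs[0], rtol=1e-03, atol=1e-05)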
diff --git a/environment.yml b/environment.yml
index 9c48f6d7e2c398..07afe8055753f2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -48,3 +48,4 @@ dependencies:
- wandb>=0.8.21
- neptune-client>=0.4.109
- horovod>=0.19.1
+ - onnxruntime>=1.3.0
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index afb9fa0a9266a6..5ff64156e1b337 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -2,6 +2,7 @@
import inspect
import os
import re
+import tempfile
from abc import ABC, abstractmethod
from argparse import Namespace
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
@@ -1723,6 +1724,44 @@ def _set_hparams(self, hp: Union[dict, Namespace, str]) -> None:
else:
self._hparams = hp
+ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwargs):
+ """Saves the model in ONNX format
+
+ Args:
+ file_path: The path of the file the model should be saved to.
+ input_sample: A sample of an input tensor for tracing.
+ **kwargs: Will be passed to the ``torch.onnx.export`` function.
+
+ Example:
+ >>> class SimpleModel(LightningModule):
+ ... def __init__(self):
+ ... super().__init__()
+ ... self.l1 = torch.nn.Linear(in_features=64, out_features=4)
+ ...
+ ... def forward(self, x):
+ ... return torch.relu(self.l1(x.view(x.size(0), -1)))
+
+ >>> with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmpfile:
+ ... model = SimpleModel()
+ ... input_sample = torch.randn((1, 64))
+ ... model.to_onnx(tmpfile.name, input_sample, export_params=True)
+ ... os.path.isfile(tmpfile.name)
+ True
+ """
+
+ if isinstance(input_sample, Tensor):
+ input_data = input_sample
+ elif self.example_input_array is not None:
+ input_data = self.example_input_array
+ else:
+ raise ValueError('input_sample and example_input_array tensors are both missing.')
+
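+        # run a forward pass in eval mode to record example outputs for the export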
+ if 'example_outputs' not in kwargs:
+ self.eval()
+ kwargs['example_outputs'] = self(input_data)
+
+ torch.onnx.export(self, input_data, file_path, **kwargs)
+
@property
def hparams(self) -> Union[AttributeDict, str]:
if not hasattr(self, '_hparams'):
diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py
index 188e9cf8de0dd7..c4425b036f1e5f 100644
--- a/pytorch_lightning/trainer/__init__.py
+++ b/pytorch_lightning/trainer/__init__.py
@@ -118,7 +118,7 @@ def forward(self, x):
---------------
To ensure full reproducibility from run to run you need to set seeds for pseudo-random generators,
-and set ``deterministic``` flag in ``Trainer``.
+and set ``deterministic`` flag in ``Trainer``.
Example::
diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 012ad4876a1da8..6ba0ff8678b21b 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -130,12 +130,14 @@ def train_fx(trial_hparams, cluster_manager, _):
import os
import re
from abc import ABC, abstractmethod
+from distutils.version import LooseVersion
from typing import Union, List, Optional, Callable, Tuple
import subprocess
import sys
from time import sleep
import numpy as np
from os.path import abspath
import torch
from pytorch_lightning import _logger as log
@@ -275,9 +277,11 @@ def set_distributed_mode(self, distributed_backend):
elif self.num_gpus == 1:
self.use_single_gpu = True
elif self.num_gpus > 1:
- rank_zero_warn('You requested multiple GPUs but did not specify a backend, e.g.'
- ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
- ' Setting distributed_backend=ddp_spawn for you.')
+ rank_zero_warn(
+ 'You requested multiple GPUs but did not specify a backend, e.g.'
+ ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
+ ' Setting distributed_backend=ddp_spawn for you.'
+ )
self.distributed_backend = 'ddp_spawn'
distributed_backend = 'ddp_spawn'
@@ -306,8 +310,9 @@ def set_distributed_mode(self, distributed_backend):
self.use_ddp2 = True
elif distributed_backend == "ddp_cpu":
if self.num_gpus > 0:
- rank_zero_warn('You requested one or more GPUs, but set the backend to `ddp_cpu`.'
- ' Training will not use GPUs.')
+ rank_zero_warn(
+ 'You requested one or more GPUs, but set the backend to `ddp_cpu`. Training will not use GPUs.'
+ )
self.use_ddp = True
self.data_parallel_device_ids = None
self.on_gpu = False
@@ -380,8 +385,7 @@ def determine_ddp_node_rank(self):
if len(node_ids) == 0:
return 0
if len(node_ids) > 1:
- log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. "
- f"Using the first one.")
+ log.warning(f"Multiple environment variables ({node_ids}) defined for node rank. Using the first one.")
k, rank = node_ids.pop()
rank_zero_info(f"Using environment variable {k} for node rank ({rank}).")
return int(rank)
@@ -614,7 +618,13 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results):
last_path = None
if not self.testing and best_model_path is not None and len(best_model_path) > 0:
last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path)
- torch.save(model.state_dict(), last_path)
+ # Can't use the new zipfile serialization for 1.6.0 because there's a bug in
+ # torch.hub.load_state_dict_from_url() that prevents it from loading the new files.
+ # More details can be found here: https://github.com/pytorch/pytorch/issues/42239
+ if LooseVersion(torch.__version__).version[:3] == [1, 6, 0]:
+ torch.save(model.state_dict(), last_path, _use_new_zipfile_serialization=False)
+ else:
+ torch.save(model.state_dict(), last_path)
mp_queue.put(last_path)
def save_spawn_weights(self, model):
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index 3234800a62c51c..381331fae2b096 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -87,7 +87,9 @@
import re
import signal
from abc import ABC
+from distutils.version import LooseVersion
from subprocess import call
import torch
import torch.distributed as torch_distrib
@@ -151,8 +153,7 @@ class TrainerIOMixin(ABC):
scaler: ...
def get_model(self):
- is_dp_module = isinstance(self.model, (LightningDistributedDataParallel,
- LightningDataParallel))
+ is_dp_module = isinstance(self.model, (LightningDistributedDataParallel, LightningDataParallel))
model = self.model.module if is_dp_module else self.model
return model
@@ -261,7 +262,13 @@ def _atomic_save(self, checkpoint, filepath: str):
This points to the file that the checkpoint will be stored in.
"""
tmp_path = str(filepath) + ".part"
- torch.save(checkpoint, tmp_path)
+ # Can't use the new zipfile serialization for 1.6.0 because there's a bug in
+ # torch.hub.load_state_dict_from_url() that prevents it from loading the new files.
+ # More details can be found here: https://github.com/pytorch/pytorch/issues/42239
+ if LooseVersion(torch.__version__).version[:3] == [1, 6, 0]:
+ torch.save(checkpoint, tmp_path, _use_new_zipfile_serialization=False)
+ else:
+ torch.save(checkpoint, tmp_path)
os.replace(tmp_path, filepath)
def save_checkpoint(self, filepath, weights_only: bool = False):
@@ -274,8 +281,9 @@ def save_checkpoint(self, filepath, weights_only: bool = False):
except AttributeError as err:
if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint:
del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY]
- rank_zero_warn('Warning, `module_arguments` dropped from checkpoint.'
- f' An attribute is not picklable {err}')
+ rank_zero_warn(
+ 'Warning, `module_arguments` dropped from checkpoint.' f' An attribute is not picklable {err}'
+ )
self._atomic_save(checkpoint, filepath)
def restore(self, checkpoint_path: str, on_gpu: bool):
@@ -493,8 +501,9 @@ def hpc_save(self, folderpath: str, logger):
except AttributeError as err:
if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint:
del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY]
- rank_zero_warn('warning, `module_arguments` dropped from checkpoint.'
- f' An attribute is not picklable {err}')
+ rank_zero_warn(
+ 'warning, `module_arguments` dropped from checkpoint.' f' An attribute is not picklable {err}'
+ )
self._atomic_save(checkpoint, filepath)
return filepath
diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py
index 6f9b8e176ffe1e..75130b297ddccd 100644
--- a/pytorch_lightning/utilities/apply_func.py
+++ b/pytorch_lightning/utilities/apply_func.py
@@ -6,6 +6,7 @@
import torch
import importlib
+
TORCHTEXT_AVAILABLE = importlib.util.find_spec("torchtext") is not None
if TORCHTEXT_AVAILABLE:
from torchtext.data import Batch
@@ -92,6 +93,7 @@ def move_data_to_device(batch: Any, device: torch.device):
- :meth:`torch.Tensor.to`
- :class:`torch.device`
"""
+
def batch_to(data):
# try to move torchtext data first
if TORCHTEXT_AVAILABLE and isinstance(data, Batch):
@@ -99,11 +101,10 @@ def batch_to(data):
# Shallow copy because each Batch has a reference to Dataset which contains all examples
device_data = copy(data)
for field in data.fields:
- # Batch contains output of Field.process(...) which is tensor hence .to(...) exists
- device_field = getattr(data, field).to(device, non_blocking=True)
+ device_field = move_data_to_device(getattr(data, field), device)
setattr(device_data, field, device_field)
return device_data
- else:
- return data.to(device, non_blocking=True)
+
+ return data.to(device, non_blocking=True)
return apply_to_collection(batch, dtype=(TransferableDataType, Batch), function=batch_to)
diff --git a/requirements/base.txt b/requirements/base.txt
index 8eff26906e81eb..4282f6a12d2eb2 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -1,9 +1,9 @@
# the default package dependencies
numpy>=1.16.4
-torch>=1.3, <1.6 # TODO: temporary freeze for Horovod incompatibility with 1.6
+torch>=1.3
tensorboard>=1.14
future>=0.17.1 # required for builtins in setup.py
# pyyaml>=3.13
-PyYAML>=5.1 # OmegaConf requirement
+PyYAML>=5.1 # OmegaConf requirement >=5.1
tqdm>=4.41.0
diff --git a/requirements/extra.txt b/requirements/extra.txt
index 191d24125a21d6..31ea41c083d4b8 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -12,3 +12,5 @@ omegaconf>=2.0.0
# scipy>=0.13.3
scikit-learn>=0.20.0
torchtext>=0.3.1, <0.7 # TODO: temporary fix fix for compatibility
+onnx>=1.7.0
+onnxruntime>=1.3.0
\ No newline at end of file
diff --git a/tests/Dockerfile b/tests/Dockerfile
deleted file mode 100644
index 65c75c1ba34598..00000000000000
--- a/tests/Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-ARG TORCH_VERSION=1.4
-ARG CUDA_VERSION=10.1
-
-FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn7-devel
-
-ENV HOROVOD_GPU_ALLREDUCE: NCCL
-ENV HOROVOD_GPU_BROADCAST: NCCL
-ENV HOROVOD_WITH_PYTORCH: 1
-ENV HOROVOD_WITHOUT_TENSORFLOW: 1
-ENV HOROVOD_WITHOUT_MXNET: 1
-ENV HOROVOD_WITH_GLOO: 1
-ENV HOROVOD_WITHOUT_MPI: 1
-ENV PATH: "$PATH:/root/.local/bin"
-ENV MAKEFLAGS: "-j$(nproc)"
-
-COPY ./tests/install_AMP.sh install_AMP.sh
-COPY ./requirements/base.txt requirements.txt
-COPY ./requirements/extra.txt requirements-extra.txt
-COPY ./requirements/test.txt requirements-tests.txt
-
-# Install AMP
-RUN apt-get update && apt-get install -y cmake && \
- bash install_AMP.sh && \
- pip install -r requirements.txt --user && \
- pip install -r requirements-extra.txt --user && \
- pip install -r requirements-tests.txt --user && \
- pip list
diff --git a/tests/README.md b/tests/README.md
index 6286e8b9e81278..ccd62301aa1e25 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -54,7 +54,7 @@ coverage xml
You can build it on your own; note that it takes lots of time, so be prepared.
```bash
git clone
-docker image build -t pytorch_lightning:devel-torch1.4 -f tests/Dockerfile --build-arg TORCH_VERSION=1.4 .
+docker image build -t pytorch_lightning:devel-torch1.4 -f dockers/cuda-extras/Dockerfile --build-arg TORCH_VERSION=1.4 .
```
To build other versions, select a different Dockerfile.
```bash
diff --git a/tests/base/datamodules.py b/tests/base/datamodules.py
index a55a9a718ea9d2..7973420ea478e7 100644
--- a/tests/base/datamodules.py
+++ b/tests/base/datamodules.py
@@ -5,7 +5,6 @@
class TrialMNISTDataModule(LightningDataModule):
-
def __init__(self, data_dir: str = './'):
super().__init__()
self.data_dir = data_dir
diff --git a/tests/base/model_template.py b/tests/base/model_template.py
index f529ce5735b89a..19fcd42195b96b 100644
--- a/tests/base/model_template.py
+++ b/tests/base/model_template.py
@@ -73,9 +73,7 @@ def __init__(
self.test_step_end_called = False
self.test_epoch_end_called = False
- # if you specify an example input, the summary will show input/output for each layer
- # TODO: to be fixed in #1773
- # self.example_input_array = torch.rand(5, 28 * 28)
+ self.example_input_array = torch.rand(5, 28 * 28)
# build model
self.__build_model()
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 4cb52a54610e37..71b38ac6980adb 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -21,19 +21,13 @@ def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
- trainer = Trainer(
- default_root_dir=tmpdir,
- checkpoint_callback=checkpoint,
- overfit_batches=0.20,
- max_epochs=2,
- )
+ trainer = Trainer(default_root_dir=tmpdir, checkpoint_callback=checkpoint, overfit_batches=0.20, max_epochs=2)
trainer.fit(model)
- assert checkpoint.dirpath == tmpdir / trainer.logger.name / f'version_0' / 'checkpoints'
+ assert checkpoint.dirpath == tmpdir / trainer.logger.name / 'version_0' / 'checkpoints'
@pytest.mark.parametrize(
- 'logger_version,expected',
- [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')],
+ 'logger_version,expected', [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')],
)
def test_model_checkpoint_path(tmpdir, logger_version, expected):
"""Test that "version_" prefix is only added when logger's version is an integer"""
@@ -41,12 +35,7 @@ def test_model_checkpoint_path(tmpdir, logger_version, expected):
model = EvalModelTemplate()
logger = TensorBoardLogger(str(tmpdir), version=logger_version)
- trainer = Trainer(
- default_root_dir=tmpdir,
- overfit_batches=0.2,
- max_epochs=2,
- logger=logger,
- )
+ trainer = Trainer(default_root_dir=tmpdir, overfit_batches=0.2, max_epochs=2, logger=logger)
trainer.fit(model)
ckpt_version = Path(trainer.checkpoint_callback.dirpath).parent.name
@@ -83,8 +72,9 @@ def _save_model(self, filepath, trainer, pl_module):
def on_train_end(self, trainer, pl_module):
super().on_train_end(trainer, pl_module)
# on rank 0 we expect the saved files and on all others no saves
- assert (trainer.global_rank == 0 and self.count == self.expected_count) \
- or (trainer.global_rank > 0 and self.count == 0)
+ assert (trainer.global_rank == 0 and self.count == self.expected_count) or (
+ trainer.global_rank > 0 and self.count == 0
+ )
@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
diff --git a/tests/models/test_onnx_save.py b/tests/models/test_onnx_save.py
new file mode 100644
index 00000000000000..f824f33c93bc14
--- /dev/null
+++ b/tests/models/test_onnx_save.py
@@ -0,0 +1,114 @@
+import os
+
+import onnxruntime
+import pytest
+import torch
+import numpy as np
+import tests.base.develop_pipelines as tpipes
+import tests.base.develop_utils as tutils
+from pytorch_lightning import Trainer
+from tests.base import EvalModelTemplate
+
+
+def test_model_saves_with_input_sample(tmpdir):
+ """Test that ONNX model saves with input sample and size is greater than 3 MB"""
+ model = EvalModelTemplate()
+ trainer = Trainer(max_epochs=1)
+ trainer.fit(model)
+
+ file_path = os.path.join(tmpdir, "model.onxx")
+ input_sample = torch.randn((1, 28 * 28))
+ model.to_onnx(file_path, input_sample)
+ assert os.path.isfile(file_path)
+ assert os.path.getsize(file_path) > 3e+06
+
+
+def test_model_saves_with_example_output(tmpdir):
+ """Test that ONNX model saves when provided with example output"""
+ model = EvalModelTemplate()
+ trainer = Trainer(max_epochs=1)
+ trainer.fit(model)
+
+ file_path = os.path.join(tmpdir, "model.onxx")
+ input_sample = torch.randn((1, 28 * 28))
+ model.eval()
+ example_outputs = model.forward(input_sample)
+ model.to_onnx(file_path, input_sample, example_outputs=example_outputs)
+ assert os.path.exists(file_path) is True
+
+
+def test_model_saves_with_example_input_array(tmpdir):
+ """Test that ONNX model saves with_example_input_array and size is greater than 3 MB"""
+ model = EvalModelTemplate()
+ file_path = os.path.join(tmpdir, "model.onxx")
+ model.to_onnx(file_path)
+ assert os.path.exists(file_path) is True
+ assert os.path.getsize(file_path) > 3e+06
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_model_saves_on_multi_gpu(tmpdir):
+ """Test that ONNX model saves on a distributed backend"""
+ tutils.set_random_master_port()
+
+ trainer_options = dict(
+ default_root_dir=tmpdir,
+ max_epochs=1,
+ limit_train_batches=10,
+ limit_val_batches=10,
+ gpus=[0, 1],
+ distributed_backend='ddp_spawn',
+ progress_bar_refresh_rate=0
+ )
+
+ model = EvalModelTemplate()
+
+ tpipes.run_model_test(trainer_options, model)
+
+ file_path = os.path.join(tmpdir, "model.onxx")
+ model.to_onnx(file_path)
+ assert os.path.exists(file_path) is True
+
+
+def test_verbose_param(tmpdir, capsys):
+ """Test that output is present when verbose parameter is set"""
+ model = EvalModelTemplate()
+ file_path = os.path.join(tmpdir, "model.onxx")
+ model.to_onnx(file_path, verbose=True)
+ captured = capsys.readouterr()
+ assert "graph(%" in captured.out
+
+
+def test_error_if_no_input(tmpdir):
+ """Test that an exception is thrown when there is no input tensor"""
+ model = EvalModelTemplate()
+ model.example_input_array = None
+ file_path = os.path.join(tmpdir, "model.onxx")
+ with pytest.raises(ValueError, match=r'input_sample and example_input_array tensors are both missing'):
+ model.to_onnx(file_path)
+
+
+def test_if_inference_output_is_valid(tmpdir):
+ """Test that the output inferred from ONNX model is same as from PyTorch"""
+ model = EvalModelTemplate()
+ trainer = Trainer(max_epochs=5)
+ trainer.fit(model)
+
+ model.eval()
+ with torch.no_grad():
+ torch_out = model(model.example_input_array)
+
+ file_path = os.path.join(tmpdir, "model.onxx")
+ model.to_onnx(file_path, model.example_input_array, export_params=True)
+
+ ort_session = onnxruntime.InferenceSession(file_path)
+
+ def to_numpy(tensor):
+ return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+ # compute ONNX Runtime output prediction
+ ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(model.example_input_array)}
+ ort_outs = ort_session.run(None, ort_inputs)
+
+ # compare ONNX Runtime and PyTorch results
+ assert np.allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)
diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py
index 111ee869684c32..85c609b94d7655 100644
--- a/tests/models/test_restore.py
+++ b/tests/models/test_restore.py
@@ -309,7 +309,7 @@ def test_model_saving_loading(tmpdir):
hparams_path = os.path.join(hparams_path, 'hparams.yaml')
model_2 = EvalModelTemplate.load_from_checkpoint(
checkpoint_path=new_weights_path,
- hparams_file=hparams_path
+ hparams_file=hparams_path,
)
model_2.eval()
diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py
index 8bb2a76a24313d..ecbeb821a3bfa1 100644
--- a/tests/models/test_tpu.py
+++ b/tests/models/test_tpu.py
@@ -13,6 +13,7 @@
try:
import torch_xla
import torch_xla.distributed.xla_multiprocessing as xmp
+
SERIAL_EXEC = xmp.MpSerialExecutor()
# TODO: The tests are aborted if the following lines are uncommented. Must be resolved with XLA team
# device = torch_xla.core.xla_model.xla_device()
@@ -24,19 +25,12 @@
TPU_AVAILABLE = True
-_LARGER_DATASET = TrialMNIST(
- download=True,
- num_samples=2000,
- digits=(0, 1, 2, 5, 8),
-)
+_LARGER_DATASET = TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8))
# 8 cores needs a big dataset
def _serial_train_loader():
- return DataLoader(
- _LARGER_DATASET,
- batch_size=32,
- )
+ return DataLoader(_LARGER_DATASET, batch_size=32)
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@@ -57,42 +51,24 @@ def test_model_tpu_cores_1(tmpdir):
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
+@pytest.mark.parametrize('tpu_core', [1, 5])
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
-def test_model_tpu_index_1(tmpdir):
+def test_model_tpu_index(tmpdir, tpu_core):
"""Make sure model trains on TPU."""
trainer_options = dict(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
max_epochs=1,
distributed_backend='tpu',
- tpu_cores=[1],
+ tpu_cores=[tpu_core],
limit_train_batches=0.4,
limit_val_batches=0.4,
)
model = EvalModelTemplate()
tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
- assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'
-
-
-@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pl_multi_process_test
-def test_model_tpu_index_5(tmpdir):
- """Make sure model trains on TPU."""
- trainer_options = dict(
- default_root_dir=tmpdir,
- progress_bar_refresh_rate=0,
- max_epochs=1,
- distributed_backend='tpu',
- tpu_cores=[5],
- limit_train_batches=0.4,
- limit_val_batches=0.4,
- )
-
- model = EvalModelTemplate()
- tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
- assert torch_xla._XLAC._xla_get_default_device() == 'xla:5'
+ assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@@ -137,24 +113,27 @@ def test_model_16bit_tpu_cores_1(tmpdir):
assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
+@pytest.mark.parametrize('tpu_core', [1, 5])
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
-def test_model_16bit_tpu_index_1(tmpdir):
+def test_model_16bit_tpu_index(tmpdir, tpu_core):
"""Make sure model trains on TPU."""
trainer_options = dict(
default_root_dir=tmpdir,
precision=16,
progress_bar_refresh_rate=0,
max_epochs=1,
distributed_backend='tpu',
- tpu_cores=[1],
+ tpu_cores=[tpu_core],
limit_train_batches=0.4,
limit_val_batches=0.4,
)
model = EvalModelTemplate()
tpipes.run_model_test(trainer_options, model, on_gpu=False)
- assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'
+ assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'
assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
@@ -183,45 +162,7 @@ def test_model_16bit_tpu_cores_8(tmpdir):
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
-def test_model_16bit_tpu_index_5(tmpdir):
- """Test if distributed TPU core training works"""
- model = EvalModelTemplate()
- trainer = Trainer(
- default_root_dir=tmpdir,
- precision=16,
- max_epochs=1,
- train_percent_check=0.4,
- val_percent_check=0.2,
- distributed_backend='tpu',
- tpu_cores=[5],
- )
- trainer.fit(model)
- assert torch_xla._XLAC._xla_get_default_device() == 'xla:5'
- assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
-
-
-@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pl_multi_process_test
-def test_early_stop_checkpoints_on_tpu(tmpdir):
- """Test if single TPU core training works"""
- model = EvalModelTemplate()
- trainer = Trainer(
- early_stop_callback=True,
- default_root_dir=tmpdir,
- progress_bar_refresh_rate=0,
- max_epochs=50,
- limit_train_batches=10,
- limit_val_batches=10,
- distributed_backend='tpu',
- tpu_cores=[1],
- )
- trainer.fit(model)
- assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'
-
-
-@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pl_multi_process_test
-def test_early_stop_checkpoints_on_tpu(tmpdir):
+def test_model_tpu_early_stop(tmpdir):
"""Test if single TPU core training works"""
model = EvalModelTemplate()
trainer = Trainer(
@@ -232,10 +173,9 @@ def test_early_stop_checkpoints_on_tpu(tmpdir):
limit_train_batches=10,
limit_val_batches=10,
distributed_backend='tpu',
- tpu_cores=[5],
+ tpu_cores=1,
)
trainer.fit(model)
- assert torch_xla._XLAC._xla_get_default_device() == 'xla:5'
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@@ -264,26 +204,15 @@ def test_dataloaders_passed_to_fit(tmpdir):
model = EvalModelTemplate()
- trainer = Trainer(
- default_root_dir=tmpdir,
- max_epochs=1,
- distributed_backend='tpu',
- tpu_cores=8,
- )
- result = trainer.fit(
- model,
- train_dataloader=model.train_dataloader(),
- val_dataloaders=model.val_dataloader(),
- )
+ trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, distributed_backend='tpu', tpu_cores=8)
+ result = trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader())
assert result, "TPU doesn't work with dataloaders passed to fit()."
-@pytest.mark.parametrize(['tpu_cores', 'expected_tpu_id'], [
- pytest.param(1, None),
- pytest.param(8, None),
- pytest.param([1], 1),
- pytest.param([8], 8),
-])
+@pytest.mark.parametrize(
+ ['tpu_cores', 'expected_tpu_id'],
+ [pytest.param(1, None), pytest.param(8, None), pytest.param([1], 1), pytest.param([8], 8)],
+)
def test_tpu_id_to_be_as_expected(tpu_cores, expected_tpu_id):
"""Test if trainer.tpu_id is set as expected"""
assert Trainer(tpu_cores=tpu_cores).tpu_id == expected_tpu_id
@@ -293,8 +222,7 @@ def test_tpu_misconfiguration():
"""Test if trainer.tpu_id is set as expected"""
with pytest.raises(MisconfigurationException, match="`tpu_cores` can only be"):
Trainer(
- tpu_cores=[1, 8],
- distributed_backend='tpu',
+ tpu_cores=[1, 8], distributed_backend='tpu',
)
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 27fa6815509620..c7652ebecf3f9a 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -61,7 +61,7 @@ def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
ckpt_path = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}' if url_ckpt else new_weights_path
model_2 = EvalModelTemplate.load_from_checkpoint(
checkpoint_path=ckpt_path,
- hparams_file=hparams_path
+ hparams_file=hparams_path,
)
model_2.eval()
@@ -99,7 +99,7 @@ def test_no_val_end_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
ckpt_path = f'http://{tmpdir_server[0]}:{tmpdir_server[1]}/{os.path.basename(new_weights_path)}' if url_ckpt else new_weights_path
model_2 = EvalModelTemplate.load_from_checkpoint(
checkpoint_path=ckpt_path,
- hparams_file=hparams_path
+ hparams_file=hparams_path,
)
model_2.eval()
diff --git a/tests/utilities/test_apply_func_torchtext.py b/tests/utilities/test_apply_func_torchtext.py
new file mode 100644
index 00000000000000..9ea29420788d7e
--- /dev/null
+++ b/tests/utilities/test_apply_func_torchtext.py
@@ -0,0 +1,52 @@
+import pytest
+import torch
+import torchtext
+from torchtext.data.example import Example
+
+from pytorch_lightning.utilities.apply_func import move_data_to_device
+
+
+def _get_torchtext_data_iterator(include_lengths=False):
+ text_field = torchtext.data.Field(sequential=True, pad_first=False, # nosec
+ init_token="<sos>", eos_token="<eos>", # nosec
+ include_lengths=include_lengths) # nosec
+
+ example1 = Example.fromdict({"text": "a b c a c"}, {"text": ("text", text_field)})
+ example2 = Example.fromdict({"text": "b c a a"}, {"text": ("text", text_field)})
+ example3 = Example.fromdict({"text": "c b a"}, {"text": ("text", text_field)})
+
+ dataset = torchtext.data.Dataset(
+ [example1, example2, example3],
+ {"text": text_field},
+ )
+ text_field.build_vocab(dataset)
+
+ iterator = torchtext.data.Iterator(dataset, batch_size=3,
+ sort_key=None, device=None,
+ batch_size_fn=None,
+ train=True, repeat=False, shuffle=None,
+ sort=None, sort_within_batch=None)
+ return iterator, text_field
+
+
+@pytest.mark.parametrize('include_lengths', [False, True])
+@pytest.mark.parametrize(['device'], [pytest.param(torch.device('cuda', 0))])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="test assumes GPU machine")
+def test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, device):
+ data_iterator, _ = _get_torchtext_data_iterator(include_lengths=include_lengths)
+ data_iter = iter(data_iterator)
+ batch = next(data_iter)
+ batch_on_device = move_data_to_device(batch, device)
+
+ if include_lengths:
+ # tensor with data
+ assert (batch_on_device.text[0].device == device)
+ # tensor with length of data
+ assert (batch_on_device.text[1].device == device)
+ else:
+ assert (batch_on_device.text.device == device)
+
+
+@pytest.mark.parametrize('include_lengths', [False, True])
+def test_batch_move_data_to_device_torchtext_include_lengths_cpu(include_lengths):
+ test_batch_move_data_to_device_torchtext_include_lengths(include_lengths, torch.device('cpu'))