update CI testing with pip upgrade (#2380)
* try pt1.5

* cpu

* upgrade

* tpu

* user

* [blocked by #2380] freeze GPU PT 1.4 (#2780)

* freeze

* user
Borda authored Jul 31, 2020
1 parent bc7a08f commit 3772601
Showing 5 changed files with 20 additions and 59 deletions.
.drone.yml: 2 changes (1 addition, 1 deletion)

@@ -6,7 +6,7 @@ name: torch-GPU

steps:
- name: testing
-   image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5
+   image: pytorchlightning/pytorch_lightning:devel-pt1.4

environment:
SLURM_LOCALID: 0
.github/workflows/ci-test-base.yml: 2 changes (1 addition, 1 deletion)

@@ -57,7 +57,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade --user pip
- pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade-strategy only-if-needed
+ pip install --requirement ./requirements/base.txt --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
pip install --requirement ./requirements/test.txt --quiet --upgrade-strategy only-if-needed
# pip install tox coverage
python --version
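The flag swap above is the heart of the commit: `--upgrade-strategy only-if-needed` on its own leaves already-satisfied requirements untouched, while `--upgrade` re-resolves every listed requirement. A minimal sketch contrasting the two invocations (assumed commands wrapped in Python for illustration, not part of this commit):

```python
# Minimal sketch contrasting the two pip invocations touched by this diff.
# Assumes a requirements/base.txt next to the script; not part of the commit.
import subprocess

# Old behaviour: without --upgrade, pip skips any requirement that is already
# satisfied, so a stale torch wheel baked into the CI image is silently kept.
subprocess.run(
    ["pip", "install", "--requirement", "requirements/base.txt",
     "--upgrade-strategy", "only-if-needed"],
    check=True,
)

# New behaviour: --upgrade re-resolves the listed requirements and bumps them
# to the newest versions their specifiers allow (their dependencies still
# follow pip's default only-if-needed strategy).
subprocess.run(
    ["pip", "install", "--requirement", "requirements/base.txt", "--upgrade"],
    check=True,
)
```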
.github/workflows/ci-testing.yml: 13 changes (4 additions, 9 deletions)

@@ -42,7 +42,7 @@ jobs:

- name: Update Pip
run: |
- pip install -U -q "pip>=20.1" # needed for get pip cacher folder
+ pip install --quiet "pip>=20.1" --upgrade --user # needed for get pip cacher folder
# Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646
- name: Setup macOS
@@ -54,14 +54,9 @@
- name: Setup Windows
if: runner.os == 'windows'
run: |
# remove Horovod from requirements
python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)"
- # TODO: remove after https://github.com/pytorch/pytorch/issues/32186 is resolved
- #- name: Setup Windows on Latest
- #  if: runner.os == 'windows' && matrix.requires == 'latest'
- #  run: |
- #    python -c "fname = 'requirements/base.txt' ; req = open(fname).read().replace('torch>=1.3', 'torch<1.5') ; open(fname, 'w').write(req)"

# versions <= 1.3 may have issues on mac with some BLAS ops due to missing mkl (https://github.com/pytorch/pytorch/issues/18996)
- name: Setup MacOS Minimal
if: runner.os == 'macOS' && matrix.requires == 'minimal'
@@ -92,8 +92,8 @@ jobs:
- name: Install dependencies
run: |
- pip install --requirement requirements/base.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade-strategy only-if-needed
# pip install -q "PyYAML>=5.3.1" # needed for installing dependencues
# python -m pip install --upgrade --user pip
+ pip install --requirement requirements/base.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade
HOROVOD_BUILD_ARCH_FLAGS="-mfma" pip install --requirement ./requirements/devel.txt --quiet --upgrade-strategy "only-if-needed"
python --version
pip --version
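The "Setup Windows" step above packs the Horovod removal into a single `python -c` one-liner. Unrolled, the logic is just a filtered rewrite of `requirements/extra.txt` (an equivalent readable sketch, not the literal CI command):

```python
# Equivalent, readable form of the one-liner in the "Setup Windows" step:
# drop the horovod pin from requirements/extra.txt, since Horovod does not
# build on the Windows CI runners.
fname = 'requirements/extra.txt'

with open(fname) as fh:
    lines = [line for line in fh.readlines() if not line.startswith('horovod')]

with open(fname, 'w') as fh:
    fh.writelines(lines)
```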
README.md: 2 changes (2 additions, 0 deletions)

@@ -437,6 +437,8 @@ You can also install any past release `0.X.Y` from this repository:
pip install https://github.com/PytorchLightning/pytorch-lightning/archive/0.X.Y.zip --upgrade
```

+ ---
+
## Lightning team

#### Leads
tests/models/test_tpu.py: 60 changes (12 additions, 48 deletions)

@@ -51,42 +51,24 @@ def test_model_tpu_cores_1(tmpdir):
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)


+ @pytest.mark.parametrize('tpu_core', [1, 5])
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
- def test_model_tpu_index_1(tmpdir):
-     """Make sure model trains on TPU."""
-     trainer_options = dict(
-         default_root_dir=tmpdir,
-         progress_bar_refresh_rate=0,
-         max_epochs=1,
-         distributed_backend='tpu',
-         tpu_cores=[1],
-         limit_train_batches=0.4,
-         limit_val_batches=0.4,
-     )
-
-     model = EvalModelTemplate()
-     tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
-     assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'
-
-
- @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
- @pl_multi_process_test
- def test_model_tpu_index_5(tmpdir):
+ def test_model_tpu_index(tmpdir, tpu_core):
    """Make sure model trains on TPU."""
    trainer_options = dict(
        default_root_dir=tmpdir,
        progress_bar_refresh_rate=0,
        max_epochs=1,
        distributed_backend='tpu',
-         tpu_cores=[5],
+         tpu_cores=[tpu_core],
        limit_train_batches=0.4,
        limit_val_batches=0.4,
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
-     assert torch_xla._XLAC._xla_get_default_device() == 'xla:5'
+     assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'


@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@@ -131,24 +113,27 @@ def test_model_16bit_tpu_cores_1(tmpdir):
    assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"


+ @pytest.mark.parametrize('tpu_core', [1, 5])
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
- def test_model_16bit_tpu_index_1(tmpdir):
+ def test_model_16bit_tpu_index(tmpdir, tpu_core):
    """Make sure model trains on TPU."""
    trainer_options = dict(
        default_root_dir=tmpdir,
        precision=16,
        progress_bar_refresh_rate=0,
-         train_percent_check=0.4,
-         val_percent_check=0.2,
        max_epochs=1,
        distributed_backend='tpu',
-         tpu_cores=[1],
+         tpu_cores=[tpu_core],
+         limit_train_batches=0.4,
+         limit_val_batches=0.4,
    )

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)
-     assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'
+     assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'
    assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"


@@ -177,27 +162,7 @@ def test_model_16bit_tpu_cores_8(tmpdir):

- @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
- @pl_multi_process_test
- def test_model_16bit_tpu_index_5(tmpdir):
-     """Test if distributed TPU core training works"""
-     model = EvalModelTemplate()
-     trainer = Trainer(
-         default_root_dir=tmpdir,
-         precision=16,
-         max_epochs=1,
-         train_percent_check=0.4,
-         val_percent_check=0.2,
-         distributed_backend='tpu',
-         tpu_cores=[5],
-     )
-     trainer.fit(model)
-     assert torch_xla._XLAC._xla_get_default_device() == 'xla:5'
-     assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables"
-
-
- @pytest.mark.parametrize('tpu_core', [1, 5])
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
- def test_train_on_single_tpu(tmpdir, tpu_core):
+ def test_model_tpu_early_stop(tmpdir):
    """Test if single TPU core training works"""
    model = EvalModelTemplate()
    trainer = Trainer(
@@ -208,10 +173,9 @@ def test_train_on_single_tpu(tmpdir, tpu_core):
        limit_train_batches=10,
        limit_val_batches=10,
        distributed_backend='tpu',
-         tpu_cores=[tpu_core],
+         tpu_cores=1,
    )
    trainer.fit(model)
-     assert torch_xla._XLAC._xla_get_default_device() == 'xla:5'


@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
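Most of the deletions above come from one refactor: the copy-pasted `*_index_1`/`*_index_5` tests collapse into single tests parametrized over the TPU core index. A minimal standalone sketch of the pattern (`build_trainer_options` is a hypothetical stand-in; the real tests pass their options to `tpipes.run_model_test` on an actual TPU machine):

```python
# Minimal sketch of the pytest.mark.parametrize pattern used in test_tpu.py.
# build_trainer_options is a hypothetical stand-in for the real trainer setup.
import pytest


def build_trainer_options(tpu_core: int) -> dict:
    """Stand-in for the trainer_options dict built in the real tests."""
    return dict(distributed_backend='tpu', tpu_cores=[tpu_core])


@pytest.mark.parametrize('tpu_core', [1, 5])
def test_tpu_core_selection(tpu_core):
    # pytest generates one test per value in the list, replacing the two
    # copy-pasted test_*_index_1 / test_*_index_5 functions with one body.
    options = build_trainer_options(tpu_core)
    assert options['tpu_cores'] == [tpu_core]
    # The real tests then assert the XLA default device matches, e.g.:
    # assert torch_xla._XLAC._xla_get_default_device() == f'xla:{tpu_core}'
```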
