From b0051e8c036fa3312ad4d37aa7141bea64ac6148 Mon Sep 17 00:00:00 2001
From: tarepan
Date: Tue, 5 Jan 2021 09:52:35 +0900
Subject: [PATCH] Add non-existing resume_from_checkpoint acceptance for auto-resubmit (#4402)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add empty resume_from_checkpoint acceptance #4366

* Fix general error catch with focused file check

* Add fsspec HTTP extras

Add fsspec's HTTPFileSystem support through the http extras.
pl already supports remote http files (e.g. #2925), so this commit does not add new functionality.

* Fix potentially excessive logging in DDP

* Add PR changelog

* Add well-written argument explanation

Co-authored-by: Adrian Wälchli

* Fix DDP-compatible restore logging

Notify where the states are restored from.
This feature was temporarily removed as a result of PR review; following a subsequent review, it was added back with DDP compatibility.

* Fix utility import paths

* Refactor load step comments

* Refactor hpc ckpt suffix acquisition

* Refactor restore/hpc_load match

* Refactor hpc load trial

* Refactor checkpoint dir check

* Refactor unneeded function nesting

* Refactor nested if

* Refactor duplicated cache clear

* Refactor attempt flow with if/elif

* Fix pep8

* Refactor hook comments

Co-authored-by: chaton

* Fix pep8

* Refactor hpc load checkpoint path acquisition

* Fix pep8

* Fix typo

Co-authored-by: Adrian Wälchli

* Fix typo

Co-authored-by: Adrian Wälchli

* Fix doc

Co-authored-by: Adrian Wälchli

* Refactor None Union type with Optional

* Fix build-doc CI failure debugged in #5329

* Fix fsspec import during build-doc #5329

* Fix test epoch

Co-authored-by: Adrian Wälchli

* Fix test with latest test models

* .

Co-authored-by: Adrian Wälchli
Co-authored-by: chaton
Co-authored-by: Jirka Borovec
Co-authored-by: Sean Naren
Co-authored-by: Roger Shieh
---
 CHANGELOG.md                                |  2 ++
 docs/source/conf.py                         |  6 +++++-
 environment.yml                             |  2 +-
 .../connectors/checkpoint_connector.py      | 12 +++++++++--
 pytorch_lightning/trainer/trainer.py        |  6 +++---
 requirements.txt                            |  2 +-
 requirements/docs.txt                       |  2 +-
 tests/models/test_restore.py                | 21 ++++++++++++++++++-
 8 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b9b705459510..68941743ed00e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added `resume_from_checkpoint` to accept a non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402))
+
 
 ### Changed
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 655e8dba30a36..2b861623599a6 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -294,10 +294,14 @@ def setup(app):
 # Ignoring Third-party packages
 # https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule
 def package_list_from_file(file):
+    """List package names (without version specifiers or extras) from a package list file
+    """
     mocked_packages = []
     with open(file, 'r') as fp:
         for ln in fp.readlines():
-            found = [ln.index(ch) for ch in list(',=<>#') if ch in ln]
+            # Example: `tqdm>=4.41.0` => `tqdm`
+            # `[` is for packages with extras
+            found = [ln.index(ch) for ch in list(',=<>#[') if ch in ln]
             pkg = ln[:min(found)] if found else ln
             if pkg.rstrip():
                 mocked_packages.append(pkg.rstrip())
diff --git a/environment.yml b/environment.yml
index 3d59c1eeed0dd..1278f15f718e9 100644
--- a/environment.yml
+++ b/environment.yml
@@ -30,7 +30,7 @@ dependencies:
   - future>=0.17.1
   - PyYAML>=5.1
   - tqdm>=4.41.0
-  - fsspec>=0.8.0
+  - fsspec[http]>=0.8.1
   #- tensorboard>=2.2.0 # not needed, already included in pytorch
 
   # Optional
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index e912462d2491b..c71cbe6ce6180 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -43,7 +43,7 @@ def __init__(self, trainer):
         # used to validate checkpointing logic
         self.has_trained = False
 
-    def restore_weights(self, model: LightningModule):
+    def restore_weights(self, model: LightningModule) -> None:
         """
         Attempt to restore a checkpoint (e.g. weights) in this priority:
         1. from HPC weights
@@ -73,11 +73,16 @@ def restore_weights(self, model: LightningModule):
         if self.trainer.on_gpu:
             torch.cuda.empty_cache()
 
-    def restore(self, checkpoint_path: str, on_gpu: bool):
+    def restore(self, checkpoint_path: str, on_gpu: bool) -> bool:
         """
         Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore.
         All restored states are listed in return value description of `dump_checkpoint`.
         """
+        # Try to read the checkpoint file at `checkpoint_path`. If it does not exist, do not restore the checkpoint.
+        fs = get_filesystem(checkpoint_path)
+        if not fs.exists(checkpoint_path):
+            rank_zero_warn("No checkpoint file exists at `resume_from_checkpoint`. Start from scratch")
+            return False
 
         # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path`
         checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
@@ -94,6 +99,9 @@ def restore(self, checkpoint_path: str, on_gpu: bool):
         # restore training state
         self.restore_training_state(checkpoint)
 
+        rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}")
+        return True
+
     def restore_model_state(self, model: LightningModule, checkpoint) -> None:
         """
         Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 25dffa52dcdab..f2e943d2783af 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -251,9 +251,9 @@ def __init__(
             train sampler and ``shuffle=False`` for val/test sampler.
             If you want to customize it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler.
 
-        resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
-            This can be a URL. If resuming from mid-epoch checkpoint, training will start from
-            the beginning of the next epoch.
+        resume_from_checkpoint: Path/URL of the checkpoint from which training is resumed. If there is
+            no checkpoint file at the path, training starts from scratch. If resuming from a mid-epoch
+            checkpoint, training will start from the beginning of the next epoch.
 
         sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
 
diff --git a/requirements.txt b/requirements.txt
index 4b8a3efb5c841..2dd5378649851 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,5 +6,5 @@ future>=0.17.1 # required for builtins in setup.py
 # pyyaml>=3.13
 PyYAML>=5.1 # OmegaConf requirement >=5.1
 tqdm>=4.41.0
-fsspec>=0.8.0
+fsspec[http]>=0.8.1
 tensorboard>=2.2.0
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 0f8f2005b88b1..df596ed2bdda8 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -11,4 +11,4 @@ https://github.com/PyTorchLightning/lightning_sphinx_theme/archive/master.zip#eg
 sphinx-autodoc-typehints
 sphinx-paramlinks<0.4.0
 sphinx-togglebutton
-sphinx-copybutton
+sphinx-copybutton
\ No newline at end of file
diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py
index 17821570bdfa7..f7773f63aa8c2 100644
--- a/tests/models/test_restore.py
+++ b/tests/models/test_restore.py
@@ -27,7 +27,7 @@
 import tests.base.develop_utils as tutils
 from pytorch_lightning import Callback, LightningModule, Trainer, seed_everything
 from pytorch_lightning.callbacks import ModelCheckpoint
-from tests.base import EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST
+from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST
 
 
 class ModelTrainerPropertyParity(Callback):
@@ -73,6 +73,25 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
     trainer.fit(model)
 
 
+def test_try_resume_from_non_existing_checkpoint(tmpdir):
+    """Test that trying to resume from a non-existing `resume_from_checkpoint` fails without error."""
+    model = BoringModel()
+    checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        logger=False,
+        callbacks=[checkpoint_cb],
+        limit_train_batches=0.1,
+        limit_val_batches=0.1,
+    )
+    # Generate checkpoint `last.ckpt` with BoringModel
+    trainer.fit(model)
+    # `restore` returns `True` if the states were restored successfully, else `False`
+    assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu)
+    assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu)
+
+
 class CaptureCallbacksBeforeTraining(Callback):
     callbacks = []
 
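
The core behaviour change in checkpoint_connector.py is that `restore` now probes the checkpoint path through fsspec before loading, returns `False` when nothing exists there, and returns `True` after a successful restore, so an auto-resubmitted job can always pass the same path. The sketch below illustrates that pattern in isolation; it assumes fsspec (with the `http` extra for remote URLs) is installed, and `try_restore` / `_filesystem_for` are illustrative stand-ins for the connector method and pytorch_lightning's `get_filesystem` helper, not part of the patch (`print` stands in for `rank_zero_warn`).

    import fsspec
    import torch


    def _filesystem_for(path: str):
        # Pick the filesystem from the URL scheme; plain paths fall back to the local filesystem.
        protocol = path.split("://", 1)[0] if "://" in path else "file"
        return fsspec.filesystem(protocol)


    def try_restore(checkpoint_path: str) -> bool:
        """Return True if a checkpoint was found and loaded, False otherwise."""
        fs = _filesystem_for(checkpoint_path)
        if not fs.exists(checkpoint_path):
            # Mirrors the new behaviour: warn and start from scratch instead of raising.
            print(f"No checkpoint file exists at {checkpoint_path!r}. Start from scratch")
            return False
        with fs.open(checkpoint_path, "rb") as f:
            checkpoint = torch.load(f, map_location=lambda storage, loc: storage)
        # ... restore model / optimizer / trainer state from `checkpoint` here ...
        return True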
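
The docs/source/conf.py change teaches `package_list_from_file` to also cut package names at `[`, so a requirement such as `fsspec[http]>=0.8.1` is mocked as `fsspec` during the Sphinx build instead of breaking the import mocking. A self-contained illustration of that parsing rule follows; `bare_package_names` is a hypothetical helper that operates on in-memory lines rather than a requirements file.

    def bare_package_names(requirement_lines):
        """Strip version specifiers, extras, and comments from requirement lines."""
        names = []
        for ln in requirement_lines:
            # The first of ',', '=', '<', '>', '#', '[' marks the end of the bare name,
            # so `fsspec[http]>=0.8.1` becomes `fsspec` and comment lines become ''.
            found = [ln.index(ch) for ch in ',=<>#[' if ch in ln]
            pkg = (ln[:min(found)] if found else ln).rstrip()
            if pkg:
                names.append(pkg)
        return names


    assert bare_package_names(["tqdm>=4.41.0\n", "fsspec[http]>=0.8.1\n", "# comment\n"]) == ["tqdm", "fsspec"]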
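
From the user side, the patch makes it safe to point `resume_from_checkpoint` at a checkpoint that may not exist yet, which is exactly the auto-resubmit scenario: the first job writes `last.ckpt`, and a resubmitted job restores from it. A sketch of that setup, assuming the BoringModel stub from the repository's tests package (any LightningModule works) and an example directory layout:

    import os

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint
    from tests.base import BoringModel  # repository test stub; substitute your own LightningModule

    ckpt_dir = "checkpoints"

    trainer = Trainer(
        max_epochs=10,
        callbacks=[ModelCheckpoint(dirpath=ckpt_dir, save_last=True)],
        # First run: `last.ckpt` does not exist yet, so training starts from scratch (with a warning).
        # Resubmitted run: the file exists, so model/optimizer/trainer states are restored from it.
        resume_from_checkpoint=os.path.join(ckpt_dir, "last.ckpt"),
    )
    trainer.fit(BoringModel())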