From 86e1d9f75991cb7b1254ec282c474eacb8babb20 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 04:25:13 -0700
Subject: [PATCH] [fix] Better support for rank_zero_only setting for SLURM
 and torchelastic (#6802)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli
---
 CHANGELOG.md                               |  3 ++
 pytorch_lightning/utilities/distributed.py | 12 ++++-
 tests/utilities/test_distributed.py        | 56 ++++++++++++++++++++++
 3 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 tests/utilities/test_distributed.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d2629d3928f0..ed9b2d1586ea4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -176,6 +176,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
+
+
 - Sanitize `None` params during pruning ([#6836](https://github.com/PyTorchLightning/pytorch-lightning/pull/6836))
 
 
diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py
index bf7a199fc08dc..b4793889f1299 100644
--- a/pytorch_lightning/utilities/distributed.py
+++ b/pytorch_lightning/utilities/distributed.py
@@ -44,8 +44,18 @@ def wrapped_fn(*args, **kwargs):
     return wrapped_fn
 
 
+# TODO: this should be part of the cluster environment
+def _get_rank() -> int:
+    rank_keys = ('RANK', 'SLURM_PROCID', 'LOCAL_RANK')
+    for key in rank_keys:
+        rank = os.environ.get(key)
+        if rank is not None:
+            return int(rank)
+    return 0
+
+
 # add the attribute to the function but don't overwrite in case Trainer has already set it
-rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0)))
+rank_zero_only.rank = getattr(rank_zero_only, 'rank', _get_rank())
 
 
 def _warn(*args, **kwargs):
diff --git a/tests/utilities/test_distributed.py b/tests/utilities/test_distributed.py
new file mode 100644
index 0000000000000..879a1cb9c4cd5
--- /dev/null
+++ b/tests/utilities/test_distributed.py
@@ -0,0 +1,56 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Mapping
+from unittest import mock
+
+import pytest
+
+
+@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"SLURM_PROCID": "0"}])
+def test_rank_zero_known_cluster_envs(env_vars: Mapping[str, str]):
+    """ Test that the rank environment variables set by SLURM and torchelastic are checked for rank_zero_only. """
""" + from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only + rank_zero_only.rank = _get_rank() + + with mock.patch.dict(os.environ, env_vars): + from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only + rank_zero_only.rank = _get_rank() + + @rank_zero_only + def foo(): # The return type is optional because on non-zero ranks it will not be called + return 1 + + x = foo() + assert x == 1 + + +@pytest.mark.parametrize("rank_key,rank", [ + ("RANK", "1"), + ("SLURM_PROCID", "2"), + ("LOCAL_RANK", "3"), +]) +def test_rank_zero_none_set(rank_key, rank): + """ Test that function is not called when rank environment variables are not global zero. """ + + with mock.patch.dict(os.environ, {rank_key: rank}): + from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only + rank_zero_only.rank = _get_rank() + + @rank_zero_only + def foo(): + return 1 + + x = foo() + assert x is None