From 86e1d9f75991cb7b1254ec282c474eacb8babb20 Mon Sep 17 00:00:00 2001
From: ananthsub
Date: Wed, 7 Apr 2021 04:25:13 -0700
Subject: [PATCH] [fix] Better support for rank_zero_only setting for SLURM
 and torchelastic (#6802)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Adrian Wälchli
---
 CHANGELOG.md                               |  3 ++
 pytorch_lightning/utilities/distributed.py | 12 ++++-
 tests/utilities/test_distributed.py        | 56 ++++++++++++++++++++++
 3 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 tests/utilities/test_distributed.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d2629d3928f0..ed9b2d1586ea4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -176,6 +176,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802/))
+
+
 - Sanitize `None` params during pruning ([#6836](https://github.com/PyTorchLightning/pytorch-lightning/pull/6836))
 
 
diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py
index bf7a199fc08dc..b4793889f1299 100644
--- a/pytorch_lightning/utilities/distributed.py
+++ b/pytorch_lightning/utilities/distributed.py
@@ -44,8 +44,18 @@ def wrapped_fn(*args, **kwargs):
     return wrapped_fn
 
 
+# TODO: this should be part of the cluster environment
+def _get_rank() -> int:
+    rank_keys = ('RANK', 'SLURM_PROCID', 'LOCAL_RANK')
+    for key in rank_keys:
+        rank = os.environ.get(key)
+        if rank is not None:
+            return int(rank)
+    return 0
+
+
 # add the attribute to the function but don't overwrite in case Trainer has already set it
-rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0)))
+rank_zero_only.rank = getattr(rank_zero_only, 'rank', _get_rank())
 
 
 def _warn(*args, **kwargs):
diff --git a/tests/utilities/test_distributed.py b/tests/utilities/test_distributed.py
new file mode 100644
index 0000000000000..879a1cb9c4cd5
--- /dev/null
+++ b/tests/utilities/test_distributed.py
@@ -0,0 +1,56 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Mapping
+from unittest import mock
+
+import pytest
+
+
+@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"SLURM_PROCID": "0"}])
+def test_rank_zero_known_cluster_envs(env_vars: Mapping[str, str]):
+    """ Test that the rank environment variables set by SLURM and torchelastic are checked for rank_zero_only. """
""" + from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only + rank_zero_only.rank = _get_rank() + + with mock.patch.dict(os.environ, env_vars): + from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only + rank_zero_only.rank = _get_rank() + + @rank_zero_only + def foo(): # The return type is optional because on non-zero ranks it will not be called + return 1 + + x = foo() + assert x == 1 + + +@pytest.mark.parametrize("rank_key,rank", [ + ("RANK", "1"), + ("SLURM_PROCID", "2"), + ("LOCAL_RANK", "3"), +]) +def test_rank_zero_none_set(rank_key, rank): + """ Test that function is not called when rank environment variables are not global zero. """ + + with mock.patch.dict(os.environ, {rank_key: rank}): + from pytorch_lightning.utilities.distributed import _get_rank, rank_zero_only + rank_zero_only.rank = _get_rank() + + @rank_zero_only + def foo(): + return 1 + + x = foo() + assert x is None