From 597ae27c9075cffd96f4c44e4026301125a59918 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 12:55:28 +0000 Subject: [PATCH 01/54] resolve bug --- .../callbacks/model_checkpoint.py | 20 ++---- pytorch_lightning/core/step_result.py | 6 ++ pytorch_lightning/plugins/training_type/dp.py | 3 + .../plugins/training_type/parallel.py | 14 ++-- .../plugins/training_type/tpu_spawn.py | 15 ++-- .../training_type/training_type_plugin.py | 6 +- .../logger_connector/epoch_result_store.py | 45 ++++++++++-- pytorch_lightning/trainer/trainer.py | 1 + .../test_checkpoint_callback_frequency.py | 37 ++++++++++ tests/special_tests.sh | 1 + .../logging_/test_eval_loop_logging_1_0.py | 68 +++++++++++++++++++ 11 files changed, 186 insertions(+), 30 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index ec735eb5363a2..92faa42a73862 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -306,7 +306,7 @@ def _save_model(self, filepath: str, trainer, pl_module): else: raise ValueError(".save_function() not set") - def check_monitor_top_k(self, current) -> bool: + def check_monitor_top_k(self, trainer, current: Optional[torch.Tensor]) -> bool: if current is None: return False @@ -326,7 +326,9 @@ def check_monitor_top_k(self, current) -> bool: current = torch.tensor(current) monitor_op = {"min": torch.lt, "max": torch.gt}[self.mode] - return monitor_op(current, self.best_k_models[self.kth_best_model_path]).item() + should_update_best_and_save = monitor_op(current, self.best_k_models[self.kth_best_model_path]) + should_update_best_and_save = trainer.training_type_plugin.reduce_model_checkpoint_decision(should_update_best_and_save) + return should_update_best_and_save @classmethod def _format_checkpoint_name( @@ -523,15 +525,7 @@ def _save_top_k_checkpoints(self, trainer, pl_module, metrics): epoch = metrics.get("epoch") step = metrics.get("step") - # when `val_loss` is being logged and no ModelCheckpoint is being provided - # `val_loss` will be selected for monitor and need to be reduced to - # prevent processes divergence - # TODO: Move this logic to logger_connector. This also needs to be fixed for any - # other monitor logged value which aren't produced from a Metric. - if self.monitor == "val_loss": - current = trainer.training_type_plugin.reduce(current, reduce_op="mean") - - if self.check_monitor_top_k(current): + if self.check_monitor_top_k(trainer, current): self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics) elif self.monitor is not None and self.verbose: rank_zero_info(f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}") @@ -596,5 +590,5 @@ def file_exists(self, filepath: Union[str, Path], trainer) -> bool: the internal state to diverge between ranks. """ exists = self._fs.exists(filepath) - exists = trainer.training_type_plugin.broadcast(exists) - return exists + exists = trainer.lightning_module.all_gather(exists) + return bool(exists[0]) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index f8d7a2ffe3a23..bb13480c35376 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -633,6 +633,12 @@ def rename_keys(self, map_dict: dict): meta[dest] = meta[source] del meta[source] + def get_non_metrics_keys(self): + """ + This function is used to filter metric keys for which the value isn't a Metric + """ + return [k for k, v in self.items() if not isinstance(v, Metric)] + def choose_last(x): if isinstance(x, (torch.Tensor, list)): diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index c2b16303e5d4e..d56ee00b89465 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -68,6 +68,9 @@ def broadcast(self, obj: object, src: int = 0) -> object: def reduce_early_stopping_decision(self, should_stop: bool) -> bool: return should_stop + def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: + return should_update_best_and_save + def training_step(self, *args, **kwargs): return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index f3c825fe9cd7a..d71ef221a93d0 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -77,11 +77,17 @@ def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=len(self.parallel_devices), rank=self.global_rank) return distributed_sampler_kwargs + def reduce_decision(self, decision: bool) -> bool: + decision = torch.tensor(int(decision), device=self.lightning_module.device) + decision = self.reduce(decision, reduce_op=ReduceOp.SUM) + decision = bool(decision == self.world_size) + return decision + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) - should_stop = self.reduce(should_stop, reduce_op=ReduceOp.SUM) - should_stop = bool(should_stop == self.world_size) - return should_stop + return self.reduce_decision(should_stop) + + def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: + return self.reduce_decision(should_update_best_and_save) @property def torch_distributed_backend(self): diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index 9639a17e637bb..a5410277823d8 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -201,12 +201,17 @@ def save_spawn_weights(self, model: LightningModule) -> Optional[str]: model.trainer.save_checkpoint(path) return path + def reduce_decision(self, decision: bool) -> bool: + decision = torch.tensor(int(decision), device=self.device) + decision = self.reduce(decision, "sum") + decision = bool(decision == self.world_size) + return decision + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - should_stop = torch.tensor(int(should_stop), device=self.lightning_module.device) - stop = xm.mesh_reduce('stop_signal', should_stop, sum) - rendezvous("pl.EarlyStoppingCallback.stop_distributed_training_check") - should_stop = int(stop.item()) == self.world_size - return should_stop + return self.reduce_decision(should_stop) + + def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: + return self.reduce_decision(should_update_best_and_save) def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if not isinstance(output, torch.Tensor): diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index cf4b93e04e2dc..2615291a319e0 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -78,9 +78,13 @@ def broadcast(self, obj: object, src: int = 0) -> object: """Broadcasts an object to all processes""" def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - """Reduce the early stopping decision across all possibly spawned processes""" + """Reduce the early stopping decision across all processes""" return should_stop + def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: + """Reduce the model checkpoint decision across all processes""" + return should_update_best_and_save + def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): """Run before precision plugin executes backward""" diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index a547144c8a6f3..935ec91c6e18e 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -11,14 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import pytorch_lightning as pl from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple - +from typing import Any, Callable, Dict, List, Optional, Tuple +from weakref import proxy import torch - +import logging from pytorch_lightning.core.step_result import Result from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import DistributedType, LightningEnum +from pytorch_lightning.utilities.warnings import WarningCache + +log = logging.getLogger(__name__) + + +class MetricWarningCache(WarningCache): + def __init__(self): + super().__init__() + self.warned_metrics = [] + + +warning_cache = MetricWarningCache() class ResultStoreType(LightningEnum): @@ -50,8 +63,9 @@ class HookResultStore: Those data structures enables us to reduce properly Result object when batch loop is finished. """ - def __init__(self, fx_name): + def __init__(self, fx_name: str, all_gather_fn: Callable) -> None: self._fx_name = fx_name + self._all_gather_fn = all_gather_fn self._internals = {} self._internals_reduced = {} self._internal_type = None @@ -104,8 +118,24 @@ def get_batch_log_metrics(self, *args, **kwargs): def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> None: if not isinstance(opt_metric, Result): raise Exception("The provided opt_metric should be a Result Object. Something is wrong") + func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) + + if torch.distributed.is_initialized(): + device = self._all_gather_fn.__self__.device + for non_metric_key in opt_metric.get_non_metrics_keys(): + if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: + metric = self._all_gather_fn(metrics_to_log[non_metric_key].to(device)) + if any(metric[0] != m for m in metric[1:]): + warning_cache.warn( + f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} " + "doesn't appear to be the same accross all processes" + "HINT: One could either do: self.log(..., sync_dist=True, sync_fn=torch.mean) to force mean reduction " + "across processes which can be unaccurate or implement a ``pytorch_lightning.metrics.Metric``" + ) + warning_cache.warned_metrics.append(non_metric_key) + results.append(metrics_to_log) def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> List[Dict]: @@ -222,8 +252,8 @@ class EpochResultStore: ``` """ - def __init__(self, trainer, stage): - self.trainer = trainer + def __init__(self, trainer: 'pl.Trainer', stage: RunningStage): + self.trainer = proxy(trainer) self._stage = stage self.reset() @@ -276,7 +306,8 @@ def cache_result(self) -> None: info = self.info fx_name = info["fx_name"] - self._internals.setdefault(fx_name, HookResultStore(fx_name)) + all_gather_fn = self.trainer.lightning_module.all_gather + self._internals.setdefault(fx_name, HookResultStore(fx_name, all_gather_fn)) # attach capture batch_size Result.attach_batch_size(self._batch_size, hook_result) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7bfc3d41f9a8d..b950c8bf8a7b8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Trainer to automate the training.""" +import os import logging import warnings from itertools import count diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 6ce1938d3990f..4907983d131a4 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -19,6 +19,7 @@ from pytorch_lightning import callbacks, seed_everything, Trainer from tests.helpers import BoringModel +from tests.helpers.runif import RunIf @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @@ -98,3 +99,39 @@ def training_step(self, batch, batch_idx): # make sure types are correct assert save_mock.call_count == expected + + +@mock.patch('torch.save') +@RunIf(special=True, min_gpus=2) +@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(2, 2, 0.3, 5)]) +def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + self.last_coeff = 10.0 + + def training_step(self, batch, batch_idx): + local_rank = int(os.getenv("LOCAL_RANK")) + self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True) + return super().training_step(batch, batch_idx) + + model = TestModel() + trainer = Trainer( + callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")], + default_root_dir=tmpdir, + max_epochs=epochs, + weights_summary=None, + val_check_interval=val_check_interval, + accelerator="ddp", + gpus=2, + limit_train_batches=64, + limit_val_batches=32, + ) + if os.getenv("LOCAL_RANK") == "0": + with pytest.raises(UserWarning, match="The value associated to the key my_loss_epoch: [15.5, 31.0]"): + trainer.fit(model) + assert save_mock.call_count == expected + else: + trainer.fit(model) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index ffb21255a6d3c..4fd6a139561b4 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -32,3 +32,4 @@ python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_trainer_ python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model +python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_ddp diff --git a/tests/trainer/logging_/test_eval_loop_logging_1_0.py b/tests/trainer/logging_/test_eval_loop_logging_1_0.py index e5cf596a78eca..8bc21e905680e 100644 --- a/tests/trainer/logging_/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_/test_eval_loop_logging_1_0.py @@ -925,3 +925,71 @@ def get_metrics_at_idx(idx): } assert set(trainer.callback_metrics) == expected_callback_metrics assert set(results[0]) == {'test_loss', 'debug_epoch'} + + +@mock.patch("pytorch_lightning.trainer.connectors.logger_connector.logger_connector.LoggerConnector.log_metrics") +@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) +def test_validation_step_log_ddp(mock_log_metrics, tmpdir): + """ + This tests make sure we properly log_metrics to loggers + """ + + class ExtendedModel(BoringModel): + + val_losses = [] + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('train_loss', loss) + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.val_losses.append(loss) + self.log('valid_loss_0', loss, on_step=True, on_epoch=True) + self.log('valid_loss_1', loss, on_step=False, on_epoch=True) + self.log('valid_loss_2', loss, on_step=True, on_epoch=False) + self.log('valid_loss_3', loss, on_step=False, on_epoch=False) + self.log('valid_loss_4', loss.clone(), on_step=False, on_epoch=True, sync_dist=True) + return {"val_loss": loss} + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('test_loss', loss) + return {"y": loss} + + model = ExtendedModel() + model.validation_epoch_end = None + + # Initialize a trainer + trainer = Trainer( + default_root_dir=tmpdir, + logger=TensorBoardLogger(tmpdir), + limit_train_batches=2, + limit_val_batches=2, + limit_test_batches=2, + max_epochs=2, + accelerator="ddp", + gpus=2, + progress_bar_refresh_rate=1, + ) + + # Train the model ⚡ + trainer.fit(model) + + def get_metrics(mock_call): + if isinstance(mock_call.kwargs, dict): + return mock_call.kwargs + else: + return mock_call[1][0] + + # hp_metric + 2 steps + epoch + 2 steps + epoch + expected_num_calls = 1 + 2 + 1 + 2 + + assert len(mock_log_metrics.mock_calls) == expected_num_calls + + for mock_call in mock_log_metrics.mock_calls: + metric = get_metrics(mock_call) \ No newline at end of file From ef1192777e19bccfc2ee47f0210b3f4a93eb08bb Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 4 Mar 2021 13:46:15 +0000 Subject: [PATCH 02/54] update --- .../callbacks/model_checkpoint.py | 6 +- .../logger_connector/epoch_result_store.py | 16 +++-- pytorch_lightning/trainer/trainer.py | 1 - tests/callbacks/test_pruning.py | 4 +- .../logging_/test_eval_loop_logging_1_0.py | 68 ------------------- 5 files changed, 17 insertions(+), 78 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 92faa42a73862..c6cbbdaa0465a 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -327,7 +327,11 @@ def check_monitor_top_k(self, trainer, current: Optional[torch.Tensor]) -> bool: monitor_op = {"min": torch.lt, "max": torch.gt}[self.mode] should_update_best_and_save = monitor_op(current, self.best_k_models[self.kth_best_model_path]) - should_update_best_and_save = trainer.training_type_plugin.reduce_model_checkpoint_decision(should_update_best_and_save) + + # If using multiple devices, make sure all processes are unimanious on the decision. + reduce_model_checkpoint_decision_fn = trainer.training_type_plugin.reduce_model_checkpoint_decision + should_update_best_and_save = reduce_model_checkpoint_decision_fn(should_update_best_and_save) + return should_update_best_and_save @classmethod diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 935ec91c6e18e..c9d7acfd36417 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -11,12 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import pytorch_lightning as pl +import logging from collections import defaultdict from typing import Any, Callable, Dict, List, Optional, Tuple from weakref import proxy + import torch -import logging + +import pytorch_lightning as pl from pytorch_lightning.core.step_result import Result from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import DistributedType, LightningEnum @@ -26,6 +28,7 @@ class MetricWarningCache(WarningCache): + def __init__(self): super().__init__() self.warned_metrics = [] @@ -121,7 +124,7 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) - + if torch.distributed.is_initialized(): device = self._all_gather_fn.__self__.device for non_metric_key in opt_metric.get_non_metrics_keys(): @@ -130,9 +133,10 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non if any(metric[0] != m for m in metric[1:]): warning_cache.warn( f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} " - "doesn't appear to be the same accross all processes" - "HINT: One could either do: self.log(..., sync_dist=True, sync_fn=torch.mean) to force mean reduction " - "across processes which can be unaccurate or implement a ``pytorch_lightning.metrics.Metric``" + "doesn't appear to be the same accross all processes. " + "HINT: One could either do: self.log(..., sync_dist=True, sync_fn=torch.mean)" + " to force mean reduction across processes which can be unaccurate or implement" + " a ``pytorch_lightning.metrics.Metric``" ) warning_cache.warned_metrics.append(non_metric_key) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b950c8bf8a7b8..7bfc3d41f9a8d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Trainer to automate the training.""" -import os import logging import warnings from itertools import count diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 23b2fcbb52235..0e63fc29d49b1 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os from collections import OrderedDict from logging import INFO @@ -22,7 +21,7 @@ from torch.nn import Sequential from pytorch_lightning import seed_everything, Trainer -from pytorch_lightning.callbacks import ModelPruning, ModelCheckpoint +from pytorch_lightning.callbacks import ModelCheckpoint, ModelPruning from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -274,6 +273,7 @@ def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog): seed_everything(0) class TestPruning(ModelPruning): + def on_save_checkpoint(self, trainer, pl_module, checkpoint): super().on_save_checkpoint(trainer, pl_module, checkpoint) assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"] diff --git a/tests/trainer/logging_/test_eval_loop_logging_1_0.py b/tests/trainer/logging_/test_eval_loop_logging_1_0.py index 8bc21e905680e..e5cf596a78eca 100644 --- a/tests/trainer/logging_/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_/test_eval_loop_logging_1_0.py @@ -925,71 +925,3 @@ def get_metrics_at_idx(idx): } assert set(trainer.callback_metrics) == expected_callback_metrics assert set(results[0]) == {'test_loss', 'debug_epoch'} - - -@mock.patch("pytorch_lightning.trainer.connectors.logger_connector.logger_connector.LoggerConnector.log_metrics") -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -def test_validation_step_log_ddp(mock_log_metrics, tmpdir): - """ - This tests make sure we properly log_metrics to loggers - """ - - class ExtendedModel(BoringModel): - - val_losses = [] - - def training_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.log('train_loss', loss) - return {"loss": loss} - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.val_losses.append(loss) - self.log('valid_loss_0', loss, on_step=True, on_epoch=True) - self.log('valid_loss_1', loss, on_step=False, on_epoch=True) - self.log('valid_loss_2', loss, on_step=True, on_epoch=False) - self.log('valid_loss_3', loss, on_step=False, on_epoch=False) - self.log('valid_loss_4', loss.clone(), on_step=False, on_epoch=True, sync_dist=True) - return {"val_loss": loss} - - def test_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - self.log('test_loss', loss) - return {"y": loss} - - model = ExtendedModel() - model.validation_epoch_end = None - - # Initialize a trainer - trainer = Trainer( - default_root_dir=tmpdir, - logger=TensorBoardLogger(tmpdir), - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - max_epochs=2, - accelerator="ddp", - gpus=2, - progress_bar_refresh_rate=1, - ) - - # Train the model ⚡ - trainer.fit(model) - - def get_metrics(mock_call): - if isinstance(mock_call.kwargs, dict): - return mock_call.kwargs - else: - return mock_call[1][0] - - # hp_metric + 2 steps + epoch + 2 steps + epoch - expected_num_calls = 1 + 2 + 1 + 2 - - assert len(mock_log_metrics.mock_calls) == expected_num_calls - - for mock_call in mock_log_metrics.mock_calls: - metric = get_metrics(mock_call) \ No newline at end of file From 85b327d7b4bfe15c917679d7144b91b12aade13e Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 4 Mar 2021 13:50:32 +0000 Subject: [PATCH 03/54] update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27e5f4be2d04a..f6d85d976ba4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -101,6 +101,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297) +- Fixed `ModelCheckpoint` file_exists using all_gather + add warnings with DDP when non-metrics aren't reduced ([#6345](https://github.com/PyTorchLightning/pytorch-lightning/pull/6345)) + + ## [1.2.1] - 2021-02-23 ### Fixed From 47f0b2cff403ac649be77260132971509d8caa8e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 17:53:56 +0000 Subject: [PATCH 04/54] update PR --- pytorch_lightning/accelerators/accelerator.py | 2 +- .../callbacks/model_checkpoint.py | 6 +- .../plugins/precision/apex_amp.py | 5 +- .../plugins/training_type/horovod.py | 6 +- .../plugins/training_type/parallel.py | 10 +- .../plugins/training_type/single_device.py | 6 +- .../training_type/training_type_plugin.py | 4 + setup.cfg | 12 +- .../data/horovod/train_default_model.py | 8 +- tests/models/test_horovod.py | 119 ++++++++++++------ 10 files changed, 117 insertions(+), 61 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 38fb423d22aa8..01500ddef7fa8 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -404,7 +404,7 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra Return: A tensor of shape (world_size, batch, ...) """ - return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) + return self.training_type_plugin.all_gather(tensor, group=group, sync_grads=sync_grads) def process_dataloader(self, dataloader: Union[Iterable, DataLoader]) -> Union[Iterable, DataLoader]: """Wraps the dataloader if necessary diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 92faa42a73862..747d333c0e7ba 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -591,4 +591,8 @@ def file_exists(self, filepath: Union[str, Path], trainer) -> bool: """ exists = self._fs.exists(filepath) exists = trainer.lightning_module.all_gather(exists) - return bool(exists[0]) + if exists.dim() == 0: + exists = exists.item() + else: + exists = exists[0].item() + return bool(exists) diff --git a/pytorch_lightning/plugins/precision/apex_amp.py b/pytorch_lightning/plugins/precision/apex_amp.py index 75570e453ec1b..d3dd4629691f6 100644 --- a/pytorch_lightning/plugins/precision/apex_amp.py +++ b/pytorch_lightning/plugins/precision/apex_amp.py @@ -45,6 +45,8 @@ def connect(self, model: torch.nn.Module, optimizers: Sequence['Optimizer'], return model, optimizers, lr_schedulers model, optimizers = self.configure_apex(amp, model, list(optimizers), self.amp_level) self.reinit_scheduler_properties(optimizers, lr_schedulers) + for opt in optimizers: + opt.zero_grad() return model, optimizers, lr_schedulers def backward( @@ -167,7 +169,6 @@ def pre_optimizer_step( if not pl_module.automatic_optimization: pl_module.trainer.call_hook("on_after_backward") - + optimizer.step(**kwargs) - return False diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 8fe52190fd7bb..664fdf6321e40 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import ExitStack -from typing import Any, List, Optional, Union +from typing import Any, List, Optional, Union, Callable import torch import torch.distributed as torch_distrib @@ -159,8 +159,8 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ hvd.join() return hvd.allreduce(tensor, op=reduce_op) - def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): - if group is not None: + def all_gather(self, result: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + if group is not None and group != torch.distributed.group.WORLD: raise ValueError( "Horovod does not support allgather using a subcommunicator at this time. " "Unset `group`." diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index d71ef221a93d0..fd3b417887354 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -15,7 +15,7 @@ import os from abc import ABC, abstractmethod from contextlib import contextmanager -from typing import List, Optional +from typing import List, Optional, Any import torch from torch.nn.parallel import DistributedDataParallel @@ -24,7 +24,7 @@ from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin -from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, ReduceOp +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, ReduceOp, sync_ddp_if_available class ParallelPlugin(TrainingTypePlugin, ABC): @@ -77,6 +77,10 @@ def distributed_sampler_kwargs(self): distributed_sampler_kwargs = dict(num_replicas=len(self.parallel_devices), rank=self.global_rank) return distributed_sampler_kwargs + def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + """Perform a all_gather on all processes """ + return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) + def reduce_decision(self, decision: bool) -> bool: decision = torch.tensor(int(decision), device=self.lightning_module.device) decision = self.reduce(decision, reduce_op=ReduceOp.SUM) @@ -131,7 +135,7 @@ def broadcast(self, obj: object, src: int) -> object: torch.save(obj, buffer) data = bytearray(buffer.getbuffer()) data_tensor = torch.tensor(data).to(self.root_device, dtype=torch.float) - data = all_gather_ddp_if_available(data_tensor) + data = self.all_gather(data_tensor) buffer = io.BytesIO(data.cpu().byte().numpy()) obj = torch.load(buffer) return obj diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index d11ae87bed660..d3b3401b78b07 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Union +from typing import Any, Union, Optional import torch @@ -47,6 +47,10 @@ def reduce(self, tensor: Union[Any, torch.Tensor], *args: Any, **kwargs: Any) -> """ return tensor + def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + """Perform a all_gather on all processes """ + return tensor + @property def root_device(self) -> torch.device: return self.device diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 2615291a319e0..3f3c742fd59ca 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -77,6 +77,10 @@ def barrier(self, name: Optional[str] = None) -> None: def broadcast(self, obj: object, src: int = 0) -> object: """Broadcasts an object to all processes""" + @abstractmethod + def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + """Perform a all_gather on all processes """ + def reduce_early_stopping_decision(self, should_stop: bool) -> bool: """Reduce the early stopping decision across all processes""" return should_stop diff --git a/setup.cfg b/setup.cfg index 516504bf59791..395717a446a80 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,19 +44,15 @@ exclude_lines = # The actual coverage for each is 90%+ # *metrics (94%+) are temporarily removed from testing while tests speed up omit = - pytorch_lightning/accelerators/ddp_*.py - pytorch_lightning/accelerators/ddp2_*.py - pytorch_lightning/accelerators/dp_*.py - pytorch_lightning/accelerators/tpu_*.py pytorch_lightning/cluster_environments/*.py pytorch_lightning/utilities/xla_device_utils.py pytorch_lightning/utilities/distributed.py pytorch_lightning/tuner/auto_gpu_select.py # TODO: temporary, until accelerator refactor is finished - pytorch_lightning/accelerators/accelerator.py - pytorch_lightning/plugins/training_type/*.py - pytorch_lightning/plugins/precision/*.py - pytorch_lightning/plugins/base_plugin.py + # pytorch_lightning/accelerators/accelerator.py + #pytorch_lightning/plugins/training_type/*.py + #pytorch_lightning/plugins/precision/*.py + #pytorch_lightning/plugins/base_plugin.py [flake8] diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index d3868cfd979e6..4c2aea4fddd28 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -52,8 +52,14 @@ def run_test_from_config(trainer_options, on_gpu, check_size=True): ckpt_path = trainer_options['weights_save_path'] trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)]) - model = BoringModel() + class TestModel(BoringModel): + + def training_epoch_end(self, outputs) -> None: + res = self.trainer.training_type_plugin.reduce(torch.tensor(1., device=self.device)) + assert res.sum() == self.trainer.accelerator.world_size + + model = TestModel() trainer = Trainer(**trainer_options) trainer.fit(model) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 636979821b313..c90f94c0fe5c4 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -16,12 +16,13 @@ import shlex import subprocess import sys +from unittest.mock import patch import numpy as np import pytest import torch from sklearn.metrics import accuracy_score - +from torch import optim import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer @@ -32,6 +33,7 @@ from tests.helpers import BoringModel from tests.helpers.advanced_models import BasicGAN from tests.helpers.runif import RunIf +from tests import _PROJECT_ROOT if _HOROVOD_AVAILABLE: import horovod @@ -40,20 +42,30 @@ # This script will run the actual test model training in parallel TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py') - def _run_horovod(trainer_options, on_gpu=False): """Execute the training script across multiple workers in parallel.""" num_processes = trainer_options.get('gpus', 2) # for Horovod, we interpret `gpus` to be set per worker trainer_options.update(gpus=1 if on_gpu else None) tutils.reset_seed() + append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' + print(_PROJECT_ROOT, append) cmdline = [ - 'horovodrun', '-np', - str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options', + 'horovodrun', + '-np', + str(num_processes), + sys.executable, + '-m', + 'coverage', 'run', '--source', 'pytorch_lightning', append, + TEST_SCRIPT, '--trainer-options', shlex.quote(json.dumps(trainer_options)) ] if on_gpu: cmdline += ['--on-gpu'] + env_cmd = "" + for k, v in os.environ.items(): + env_cmd += f"{k}={v} " + print(env_cmd + ' '.join(cmdline)) exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy()) assert exit_code == 0 @@ -109,7 +121,9 @@ def test_horovod_multi_gpu(tmpdir): _run_horovod(trainer_options, on_gpu=True) -@pytest.mark.skip(reason="Horovod has a problem with broadcast when using apex?") # todo +# https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994 +# Check with (tgaddair) on Horovod issues if this feature is needed +@pytest.mark.skip(reason="Horovod currently doesn't work with Apex") # todo @RunIf(min_gpus=2, skip_windows=True, amp_apex=True, horovod_nccl=True) def test_horovod_apex(tmpdir): """Test Horovod with multi-GPU support using apex amp.""" @@ -130,7 +144,6 @@ def test_horovod_apex(tmpdir): _run_horovod(trainer_options, on_gpu=True) -@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") # todo @RunIf(min_gpus=2, skip_windows=True, amp_native=True, horovod_nccl=True) def test_horovod_amp(tmpdir): """Test Horovod with multi-GPU support using native amp.""" @@ -150,6 +163,22 @@ def test_horovod_amp(tmpdir): ) _run_horovod(trainer_options, on_gpu=True) +@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True) +def test_horovod_gather(tmpdir): + """Test Horovod with multi-GPU support using native amp.""" + trainer_options = dict( + default_root_dir=str(tmpdir), + weights_save_path=str(tmpdir), + gradient_clip_val=1.0, + progress_bar_refresh_rate=0, + max_epochs=1, + limit_train_batches=0.4, + limit_val_batches=0.2, + gpus=2, + deterministic=True, + accelerator='horovod', + ) + _run_horovod(trainer_options, on_gpu=True) @RunIf(min_gpus=1, skip_windows=True, horovod_nccl=True) def test_horovod_transfer_batch_to_gpu(tmpdir): @@ -211,8 +240,6 @@ def get_optimizer_params(optimizer): assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1]) -# TODO: unclear Horovod failure... -@pytest.mark.skip(reason="unclear Horovod failure...") @RunIf(skip_windows=True, horovod=True) def test_result_reduce_horovod(tmpdir): """Make sure result logging works with Horovod. @@ -261,8 +288,6 @@ def training_epoch_end(self, outputs) -> None: horovod.run(hvd_test_fn, np=2) -# TODO: unclear Horovod failure... -@pytest.mark.skip(reason="unclear Horovod failure...") @RunIf(skip_windows=True, horovod=True) def test_accuracy_metric_horovod(): num_batches = 10 @@ -289,7 +314,7 @@ def _compute_batch(): metric = Accuracy( compute_on_step=True, dist_sync_on_step=True, - dist_sync_fn=trainer.training_type_plugin.gather_all_tensors, + dist_sync_fn=trainer.training_type_plugin.all_gather, threshold=threshold ) @@ -314,33 +339,45 @@ def _compute_batch(): horovod.run(_compute_batch, np=2) -# @RunIf(skip_windows=True) -# def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): -# model = BoringModel() -# model.configure_optimizers = model.configure_optimizers__multiple_schedulers -# -# num_workers = 8 -# init_lr = hparams.get('learning_rate') * num_workers -# -# with patch('pytorch_lightning.accelerators.legacy.horovod_backend.hvd.size') as mock_hvd_size: -# mock_hvd_size.return_value = 8 -# -# # fit model -# trainer = Trainer( -# default_root_dir=tmpdir, -# max_epochs=1, -# limit_val_batches=0.5, -# limit_train_batches=0.2, -# distributed_backend='horovod' -# ) -# results = trainer.fit(model) -# assert results == 1 -# -# adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups][0] -# adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups][0] -# -# # Called ones after end of epoch with gamma=0.1 -# assert pytest.approx(init_lr * 0.1) == adjusted_lr1 -# -# # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times with gamma=0.1 -# assert pytest.approx(init_lr * 0.1) == adjusted_lr2 +@RunIf(skip_windows=True) +def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx, optimizer_idx): + return super().training_step(batch, batch_idx) + + def configure_optimizers(self): + optimizer1 = optim.Adam(self.parameters(), lr=0.1) + optimizer2 = optim.Adam(self.parameters(), lr=0.1) + lr_scheduler1 = optim.lr_scheduler.StepLR(optimizer1, 1, gamma=0.1) + lr_scheduler2 = optim.lr_scheduler.StepLR(optimizer2, 1, gamma=0.1) + return [optimizer1, optimizer2], [lr_scheduler1, lr_scheduler2] + + model = TestModel() + model.training_epoch_end = None + + num_workers = 8 + init_lr = 0.1 * num_workers + + with patch('horovod.torch.size') as mock_hvd_size: + mock_hvd_size.return_value = 8 + + # fit model + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_val_batches=0.5, + limit_train_batches=0.2, + distributed_backend='horovod' + ) + results = trainer.fit(model) + assert results == 1 + + adjusted_lr1 = [pg['lr'] for pg in trainer.optimizers[0].param_groups][0] + adjusted_lr2 = [pg['lr'] for pg in trainer.optimizers[1].param_groups][0] + + # Called ones after end of epoch with gamma=0.1 + assert pytest.approx(init_lr * 0.1) == adjusted_lr1 + + # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times with gamma=0.1 + assert pytest.approx(init_lr * 0.1) == adjusted_lr2 From 1c33b48cecd2e2d2455586cbf05422fb5334623f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 17:56:46 +0000 Subject: [PATCH 05/54] Update pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- .../trainer/connectors/logger_connector/epoch_result_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index c9d7acfd36417..00d13c00bfa04 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -134,9 +134,9 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non warning_cache.warn( f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} " "doesn't appear to be the same accross all processes. " - "HINT: One could either do: self.log(..., sync_dist=True, sync_fn=torch.mean)" + "HINT: One could either do: `self.log(..., sync_dist=True, sync_fn=torch.mean)`" " to force mean reduction across processes which can be unaccurate or implement" - " a ``pytorch_lightning.metrics.Metric``" + " a `pytorch_lightning.metrics.Metric`" ) warning_cache.warned_metrics.append(non_metric_key) From 6cd47139fcc27fa8a363f0cf9938472ea32156c1 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 18:03:26 +0000 Subject: [PATCH 06/54] add todo --- .../connectors/logger_connector/epoch_result_store.py | 1 + tests/checkpointing/test_checkpoint_callback_frequency.py | 4 ---- tests/models/test_horovod.py | 5 ----- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index c9d7acfd36417..7ca3a94d6add8 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -126,6 +126,7 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) if torch.distributed.is_initialized(): + # todo (tchaton) Resolve all_gather to apply on tensors. device = self._all_gather_fn.__self__.device for non_metric_key in opt_metric.get_non_metrics_keys(): if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 4907983d131a4..76e69775ef546 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -108,10 +108,6 @@ def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): - def __init__(self): - super().__init__() - self.last_coeff = 10.0 - def training_step(self, batch, batch_idx): local_rank = int(os.getenv("LOCAL_RANK")) self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index c90f94c0fe5c4..18b1788b21b30 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -49,7 +49,6 @@ def _run_horovod(trainer_options, on_gpu=False): trainer_options.update(gpus=1 if on_gpu else None) tutils.reset_seed() append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' - print(_PROJECT_ROOT, append) cmdline = [ 'horovodrun', '-np', @@ -62,10 +61,6 @@ def _run_horovod(trainer_options, on_gpu=False): ] if on_gpu: cmdline += ['--on-gpu'] - env_cmd = "" - for k, v in os.environ.items(): - env_cmd += f"{k}={v} " - print(env_cmd + ' '.join(cmdline)) exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy()) assert exit_code == 0 From b58d7fbb1d10df4c4cc3fc24e49a857101dc602e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 18:17:08 +0000 Subject: [PATCH 07/54] resolve issues --- pytorch_lightning/core/lightning.py | 2 +- .../connectors/logger_connector/epoch_result_store.py | 4 +--- pytorch_lightning/utilities/apply_func.py | 7 +++++++ tests/checkpointing/test_checkpoint_callback_frequency.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 98e58f86dc71a..6913a99b8c226 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -38,7 +38,7 @@ from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, ModelIO, PRIMITIVE_TYPES from pytorch_lightning.core.step_result import Result from pytorch_lightning.utilities import rank_zero_warn -from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors +from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors, move_data_to_device from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 9d96a7f96d235..7a2d2dc7c504b 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -126,11 +126,9 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) if torch.distributed.is_initialized(): - # todo (tchaton) Resolve all_gather to apply on tensors. - device = self._all_gather_fn.__self__.device for non_metric_key in opt_metric.get_non_metrics_keys(): if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: - metric = self._all_gather_fn(metrics_to_log[non_metric_key].to(device)) + metric = self._all_gather_fn(metrics_to_log[non_metric_key]) if any(metric[0] != m for m in metric[1:]): warning_cache.warn( f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} " diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 0599cccec83be..b485219fde0d3 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -166,4 +166,11 @@ def convert_to_tensors(data, device: torch.device = None): raise MisconfigurationException("device (torch.device) should be provided.") for src_dtype, conversion_func in CONVERSION_DTYPES: data = apply_to_collection(data, src_dtype, partial(conversion_func, device=device)) + + def move_to_device(value, device=None): + if device is not None: + value = value.to(device) + return value.contiguous() + + data = apply_to_collection(data, torch.Tensor, partial(move_to_device, device=device)) return data diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 76e69775ef546..f8a19ed34073b 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -103,7 +103,7 @@ def training_step(self, batch, batch_idx): @mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(2, 2, 0.3, 5)]) +@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)]) def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): From e3a084a9c27922437e9ec50f1ed3dc12bf9c3657 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 4 Mar 2021 18:20:16 +0000 Subject: [PATCH 08/54] resolve flake8 --- pytorch_lightning/accelerators/accelerator.py | 1 - pytorch_lightning/core/lightning.py | 2 +- .../plugins/training_type/horovod.py | 6 ++++-- .../models/data/horovod/train_default_model.py | 5 +++-- tests/models/test_horovod.py | 17 +++++++++-------- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 01500ddef7fa8..4660c71cfbb62 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -21,7 +21,6 @@ from pytorch_lightning.plugins.precision import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin, PrecisionPlugin from pytorch_lightning.plugins.training_type import TrainingTypePlugin from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available from pytorch_lightning.utilities.enums import AMPType, LightningEnum if TYPE_CHECKING: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 6913a99b8c226..98e58f86dc71a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -38,7 +38,7 @@ from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, ModelIO, PRIMITIVE_TYPES from pytorch_lightning.core.step_result import Result from pytorch_lightning.utilities import rank_zero_warn -from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors, move_data_to_device +from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 664fdf6321e40..39594f771e0b3 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import ExitStack -from typing import Any, List, Optional, Union, Callable +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ -159,7 +159,9 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ hvd.join() return hvd.allreduce(tensor, op=reduce_op) - def all_gather(self, result: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: + def all_gather( + self, result: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False + ) -> torch.Tensor: if group is not None and group != torch.distributed.group.WORLD: raise ValueError( "Horovod does not support allgather using a subcommunicator at this time. " diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 4c2aea4fddd28..5393b76c7e9eb 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -21,6 +21,8 @@ import os import sys +import torch + # this is needed because Conda does not use `PYTHONPATH` env var while pip and virtualenv do PYTHONPATH = os.getenv('PYTHONPATH', '') if ':' in PYTHONPATH: @@ -53,12 +55,11 @@ def run_test_from_config(trainer_options, on_gpu, check_size=True): trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)]) class TestModel(BoringModel): - + def training_epoch_end(self, outputs) -> None: res = self.trainer.training_type_plugin.reduce(torch.tensor(1., device=self.device)) assert res.sum() == self.trainer.accelerator.world_size - model = TestModel() trainer = Trainer(**trainer_options) trainer.fit(model) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 18b1788b21b30..41a5c651e6f53 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -23,6 +23,7 @@ import torch from sklearn.metrics import accuracy_score from torch import optim + import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer @@ -30,10 +31,10 @@ from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _HOROVOD_AVAILABLE +from tests import _PROJECT_ROOT from tests.helpers import BoringModel from tests.helpers.advanced_models import BasicGAN from tests.helpers.runif import RunIf -from tests import _PROJECT_ROOT if _HOROVOD_AVAILABLE: import horovod @@ -42,6 +43,7 @@ # This script will run the actual test model training in parallel TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py') + def _run_horovod(trainer_options, on_gpu=False): """Execute the training script across multiple workers in parallel.""" num_processes = trainer_options.get('gpus', 2) @@ -50,12 +52,8 @@ def _run_horovod(trainer_options, on_gpu=False): tutils.reset_seed() append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' cmdline = [ - 'horovodrun', - '-np', - str(num_processes), - sys.executable, - '-m', - 'coverage', 'run', '--source', 'pytorch_lightning', append, + 'horovodrun', '-np', + str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append, TEST_SCRIPT, '--trainer-options', shlex.quote(json.dumps(trainer_options)) ] @@ -117,7 +115,7 @@ def test_horovod_multi_gpu(tmpdir): # https://discuss.pytorch.org/t/torch-cuda-amp-vs-nvidia-apex/74994 -# Check with (tgaddair) on Horovod issues if this feature is needed +# Check with (tgaddair) on Horovod issues if this feature is needed @pytest.mark.skip(reason="Horovod currently doesn't work with Apex") # todo @RunIf(min_gpus=2, skip_windows=True, amp_apex=True, horovod_nccl=True) def test_horovod_apex(tmpdir): @@ -158,6 +156,7 @@ def test_horovod_amp(tmpdir): ) _run_horovod(trainer_options, on_gpu=True) + @RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True) def test_horovod_gather(tmpdir): """Test Horovod with multi-GPU support using native amp.""" @@ -175,6 +174,7 @@ def test_horovod_gather(tmpdir): ) _run_horovod(trainer_options, on_gpu=True) + @RunIf(min_gpus=1, skip_windows=True, horovod_nccl=True) def test_horovod_transfer_batch_to_gpu(tmpdir): @@ -336,6 +336,7 @@ def _compute_batch(): @RunIf(skip_windows=True) def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): + class TestModel(BoringModel): def training_step(self, batch, batch_idx, optimizer_idx): From 77edbedc04ed55f36cf2f891ea94498e133fbdac Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 4 Mar 2021 18:21:14 +0000 Subject: [PATCH 09/54] update --- pytorch_lightning/plugins/precision/apex_amp.py | 2 +- pytorch_lightning/plugins/training_type/parallel.py | 4 ++-- pytorch_lightning/utilities/apply_func.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/precision/apex_amp.py b/pytorch_lightning/plugins/precision/apex_amp.py index d3dd4629691f6..0e21d40dda917 100644 --- a/pytorch_lightning/plugins/precision/apex_amp.py +++ b/pytorch_lightning/plugins/precision/apex_amp.py @@ -169,6 +169,6 @@ def pre_optimizer_step( if not pl_module.automatic_optimization: pl_module.trainer.call_hook("on_after_backward") - + optimizer.step(**kwargs) return False diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index fd3b417887354..cf645bb58ca42 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -15,7 +15,7 @@ import os from abc import ABC, abstractmethod from contextlib import contextmanager -from typing import List, Optional, Any +from typing import Any, List, Optional import torch from torch.nn.parallel import DistributedDataParallel @@ -24,7 +24,7 @@ from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin -from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, ReduceOp, sync_ddp_if_available +from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available, ReduceOp class ParallelPlugin(TrainingTypePlugin, ABC): diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index b485219fde0d3..2cfbd303dd615 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -166,11 +166,11 @@ def convert_to_tensors(data, device: torch.device = None): raise MisconfigurationException("device (torch.device) should be provided.") for src_dtype, conversion_func in CONVERSION_DTYPES: data = apply_to_collection(data, src_dtype, partial(conversion_func, device=device)) - + def move_to_device(value, device=None): if device is not None: value = value.to(device) return value.contiguous() - + data = apply_to_collection(data, torch.Tensor, partial(move_to_device, device=device)) return data From 6bcc88d91fb3ac6e056976c5da84a4deeaaa03a5 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 18:33:23 +0000 Subject: [PATCH 10/54] add coverage for reduce --- pytorch_lightning/callbacks/model_checkpoint.py | 2 +- tests/models/data/horovod/train_default_model.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 0e683d622f0e7..2741a686599cd 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -595,7 +595,7 @@ def file_exists(self, filepath: Union[str, Path], trainer) -> bool: """ exists = self._fs.exists(filepath) exists = trainer.lightning_module.all_gather(exists) - if exists.dim() == 0: + if isinstance(exists, torch.Tensor) and exists.dim() == 0: exists = exists.item() else: exists = exists[0].item() diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 5393b76c7e9eb..46ab64afccb03 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -57,8 +57,8 @@ def run_test_from_config(trainer_options, on_gpu, check_size=True): class TestModel(BoringModel): def training_epoch_end(self, outputs) -> None: - res = self.trainer.training_type_plugin.reduce(torch.tensor(1., device=self.device)) - assert res.sum() == self.trainer.accelerator.world_size + res = self.trainer.training_type_plugin.reduce(torch.tensor(1., device=self.device), reduce_op="sum") + assert res.sum() == self.trainer.training_type_plugin.world_size model = TestModel() trainer = Trainer(**trainer_options) From c63bca59d0a63d03499330651174a0d31889066f Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 20:15:45 +0000 Subject: [PATCH 11/54] wip --- .../test_checkpoint_callback_frequency.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index f8a19ed34073b..32eaff73394d2 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from pytorch_lightning.core.lightning import LightningModule from unittest import mock import pytest @@ -103,7 +104,7 @@ def training_step(self, batch, batch_idx): @mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)]) +@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1)]) def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): @@ -111,8 +112,20 @@ class TestModel(BoringModel): def training_step(self, batch, batch_idx): local_rank = int(os.getenv("LOCAL_RANK")) self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True) + return super().training_step(batch, batch_idx) + def assert_broadcast(self, obj): + _obj = self.trainer.accelerator.broadcast(obj) + if os.getenv("LOCAL_RANK") == "1": + assert _obj == obj + + def training_epoch_end(self, outputs) -> None: + rank = os.getenv("LOCAL_RANK", '0') + self.assert_broadcast(rank) + #self.assert_broadcast(True, True) + + model = TestModel() trainer = Trainer( callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")], From e26d301debecf3ec4cfaae2726ec28df19178924 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 20:46:45 +0000 Subject: [PATCH 12/54] restore back to brodbact --- .../callbacks/model_checkpoint.py | 7 +- pytorch_lightning/distributed/dist.py | 165 ++++++++++++++---- test.py | 42 +++++ .../test_checkpoint_callback_frequency.py | 19 +- 4 files changed, 180 insertions(+), 53 deletions(-) create mode 100644 test.py diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 2741a686599cd..f833f7b368a9a 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -594,9 +594,4 @@ def file_exists(self, filepath: Union[str, Path], trainer) -> bool: the internal state to diverge between ranks. """ exists = self._fs.exists(filepath) - exists = trainer.lightning_module.all_gather(exists) - if isinstance(exists, torch.Tensor) and exists.dim() == 0: - exists = exists.item() - else: - exists = exists[0].item() - return bool(exists) + return trainer.training_type_plugin.broadcast(exists) diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 5da7dfa86084d..0b5bc1d86ca32 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -11,52 +11,141 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import io from typing import Any - +import pickle import torch -from torch import distributed as torch_distrib +from torch.distributed.distributed_c10d import ( + get_rank, _rank_not_in_group, get_backend, + Backend, broadcast, GroupMember +) -from pytorch_lightning.utilities import _GROUP_AVAILABLE +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 +def _object_to_tensor(obj): + buffer = pickle.dumps(obj) + byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) + return byte_tensor, local_size -WORLD = None -if _GROUP_AVAILABLE: - from torch.distributed import group - WORLD = group.WORLD +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py +def _tensor_to_object(tensor, tensor_size): + buf = tensor.numpy().tobytes()[:tensor_size] + out = pickle.loads(buf) + return out -class LightningDistributed: +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 +def broadcast_object_list(object_list, src=0, group=None): + """ + Broadcasts picklable objects in ``object_list`` to the whole group. Similar + to :func:`broadcast`, but Python objects can be passed in. + Note that all objects in ``object_list`` must be picklable in order to be + broadcasted. + + Args: + object_list (List[Any]): List of input objects to broadcast. + Each object must be picklable. Only objects on the ``src`` rank will + be broadcast, but each rank must provide lists of equal sizes. + src (int): Source rank from which to broadcast ``object_list``. + group: (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Default is ``None``. + + Returns: + ``None``. If rank is part of the group, ``object_list`` will contain the + broadcasted objects from ``src`` rank. + + .. note:: For NCCL-based processed groups, internal tensor representations + of objects must be moved to the GPU device before communication takes + place. In this case, the device used is given by + ``torch.cuda.current_device()`` and it is the user's responsiblity to + ensure that this is set so that each rank has an individual GPU, via + ``torch.cuda.set_device()``. + + .. note:: Note that this API differs slightly from the :func:`all_gather` + collective since it does not provide an ``async_op`` handle and thus + will be a blocking call. + + .. warning:: + :func:`broadcast_object_list` uses ``pickle`` module implicitly, which + is known to be insecure. It is possible to construct malicious pickle + data which will execute arbitrary code during unpickling. Only call this + function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> if dist.get_rank() == 0: + >>> # Assumes world_size of 3. + >>> objects = ["foo", 12, {1: 2}] # any picklable object + >>> else: + >>> objects = [None, None, None] + >>> dist.broadcast_object_list(objects, src=0) + >>> broadcast_objects + ['foo', 12, {1: 2}] + """ + if _rank_not_in_group(group): + return + + my_rank = get_rank() + # Serialize object_list elements to tensors on src rank. + if my_rank == src: + tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) + object_sizes_tensor = torch.cat(size_list) + else: + object_sizes_tensor = torch.LongTensor(len(object_list)) + + group_backend = get_backend(group) + is_nccl_backend = group_backend == Backend.NCCL + current_device = torch.device("cpu") + if is_nccl_backend: + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.device('cuda', torch.cuda.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + object_sizes_tensor = object_sizes_tensor.to(current_device) + + # Broadcast object sizes + broadcast(object_sizes_tensor, src=src, group=group) + + # Concatenate and broadcast serialized object tensors + if my_rank == src: + object_tensor = torch.cat(tensor_list) + else: + object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) + + if is_nccl_backend: + object_tensor = object_tensor.to(current_device) + broadcast(object_tensor, src=src, group=group) + # Deserialize objects using their stored sizes. + offset = 0 + if my_rank != src: + for i, obj_size in enumerate(object_sizes_tensor): + obj_view = object_tensor[offset : offset + obj_size] + obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] + offset += obj_size + object_list[i] = _tensor_to_object(obj_view, obj_size) + + +class LightningDistributed: + def __init__(self, rank=None, device=None): self.rank = rank self.device = device - - def broadcast(self, obj: Any, group=WORLD): - if self.rank == 0: - self._emit(obj, group) - else: - obj = self._receive(group) - return obj - - def _broadcast(self, tensor, src=0, group=WORLD): - if group is None: - return torch_distrib.broadcast(tensor, src=src) - return torch_distrib.broadcast(tensor, src=0, group=group) - - def _emit(self, obj: Any, group=WORLD): - buffer = io.BytesIO() - torch.save(obj, buffer) - data = bytearray(buffer.getbuffer()) - length_tensor = torch.tensor([len(data)]).long().to(self.device) - self._broadcast(length_tensor, src=0, group=group) - data_tensor = torch.ByteTensor(data).to(self.device) - self._broadcast(data_tensor, src=0, group=group) - - def _receive(self, group=WORLD): - length_tensor = torch.tensor([0]).long().to(self.device) - self._broadcast(length_tensor, src=0, group=group) - data_tensor = torch.empty([length_tensor.item()], dtype=torch.uint8).to(self.device) - self._broadcast(data_tensor, src=0, group=group) - buffer = io.BytesIO(data_tensor.cpu().numpy()) - obj = torch.load(buffer) + + def broadcast(self, obj: Any, group=None): + is_list = isinstance(obj, list) + + if not is_list: + obj = [obj] + + if self.rank != 0: + obj = [None for _ in range(len(obj))] + + broadcast_object_list(obj, 0, group=group or GroupMember.WORLD) + + if not is_list: + obj = obj[0] + return obj diff --git a/test.py b/test.py new file mode 100644 index 0000000000000..9310951db9893 --- /dev/null +++ b/test.py @@ -0,0 +1,42 @@ +import os +import torch +from torch.utils.data import Dataset +from pytorch_lightning import LightningModule, Trainer +class RandomDataset(Dataset): + def __init__(self, size, length): + self.len = length + self.data = torch.randn(length, size) + def __getitem__(self, index): + return self.data[index] + def __len__(self): + return self.len +class BoringModel(LightningModule): + def __init__(self): + super().__init__() + self.layer = torch.nn.Linear(32, 2) + def configure_optimizers(self): + return torch.optim.SGD(self.layer.parameters(), lr=0.1) + def forward(self, x): + return self.layer(x) + def training_step(self, batch, batch_idx): + return self(batch).sum() + def training_epoch_end(self, outputs) -> None: + #data = torch.tensor(100) + data = str(self.global_rank) + print("before broadcast", self.global_rank, data) + out = self.trainer.training_type_plugin.broadcast(str(data)) + # data should remain same + assert data == str(self.global_rank) + # out is the broadcast result from rank 0 + assert out == "0" + print(self.global_rank, data, out) +if __name__ == '__main__': + train_data = torch.utils.data.DataLoader(RandomDataset(32, 64)) + model = BoringModel() + trainer = Trainer( + default_root_dir=os.getcwd(), + gpus=2, # num_processes=2, + accelerator="ddp", + fast_dev_run=True + ) + trainer.fit(model, train_data) \ No newline at end of file diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 32eaff73394d2..6a1c2caf827c3 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -104,7 +104,7 @@ def training_step(self, batch, batch_idx): @mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1)]) +@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)]) def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): @@ -115,15 +115,16 @@ def training_step(self, batch, batch_idx): return super().training_step(batch, batch_idx) - def assert_broadcast(self, obj): - _obj = self.trainer.accelerator.broadcast(obj) - if os.getenv("LOCAL_RANK") == "1": - assert _obj == obj - def training_epoch_end(self, outputs) -> None: - rank = os.getenv("LOCAL_RANK", '0') - self.assert_broadcast(rank) - #self.assert_broadcast(True, True) + #data = torch.tensor(100) + data = str(self.global_rank) + print("before broadcast", self.global_rank, data) + out = self.trainer.training_type_plugin.broadcast(str(data)) + # data should remain same + assert data == str(self.global_rank) + # out is the broadcast result from rank 0 + assert out == "0" + print(self.global_rank, data, out) model = TestModel() From ce239fd735037c0797f71a189a257c0ed41593dd Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 20:47:35 +0000 Subject: [PATCH 13/54] remove test.py --- test.py | 42 ------------------------------------------ 1 file changed, 42 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 9310951db9893..0000000000000 --- a/test.py +++ /dev/null @@ -1,42 +0,0 @@ -import os -import torch -from torch.utils.data import Dataset -from pytorch_lightning import LightningModule, Trainer -class RandomDataset(Dataset): - def __init__(self, size, length): - self.len = length - self.data = torch.randn(length, size) - def __getitem__(self, index): - return self.data[index] - def __len__(self): - return self.len -class BoringModel(LightningModule): - def __init__(self): - super().__init__() - self.layer = torch.nn.Linear(32, 2) - def configure_optimizers(self): - return torch.optim.SGD(self.layer.parameters(), lr=0.1) - def forward(self, x): - return self.layer(x) - def training_step(self, batch, batch_idx): - return self(batch).sum() - def training_epoch_end(self, outputs) -> None: - #data = torch.tensor(100) - data = str(self.global_rank) - print("before broadcast", self.global_rank, data) - out = self.trainer.training_type_plugin.broadcast(str(data)) - # data should remain same - assert data == str(self.global_rank) - # out is the broadcast result from rank 0 - assert out == "0" - print(self.global_rank, data, out) -if __name__ == '__main__': - train_data = torch.utils.data.DataLoader(RandomDataset(32, 64)) - model = BoringModel() - trainer = Trainer( - default_root_dir=os.getcwd(), - gpus=2, # num_processes=2, - accelerator="ddp", - fast_dev_run=True - ) - trainer.fit(model, train_data) \ No newline at end of file From d8f1dc96c05013bcc8ac30ee708c2fbdfd15ea84 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 4 Mar 2021 20:52:22 +0000 Subject: [PATCH 14/54] resolve flake8 --- pytorch_lightning/distributed/dist.py | 81 +++++-------------- .../test_checkpoint_callback_frequency.py | 8 -- 2 files changed, 20 insertions(+), 69 deletions(-) diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 0b5bc1d86ca32..7ff3607ec8e7f 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -11,14 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any import pickle +from typing import Any + import torch from torch.distributed.distributed_c10d import ( - get_rank, _rank_not_in_group, get_backend, - Backend, broadcast, GroupMember + _rank_not_in_group, + Backend, + broadcast, + get_backend, + get_rank, + GroupMember, ) + # Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 def _object_to_tensor(obj): buffer = pickle.dumps(obj) @@ -36,57 +42,10 @@ def _tensor_to_object(tensor, tensor_size): # Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 -def broadcast_object_list(object_list, src=0, group=None): - """ - Broadcasts picklable objects in ``object_list`` to the whole group. Similar - to :func:`broadcast`, but Python objects can be passed in. - Note that all objects in ``object_list`` must be picklable in order to be - broadcasted. - - Args: - object_list (List[Any]): List of input objects to broadcast. - Each object must be picklable. Only objects on the ``src`` rank will - be broadcast, but each rank must provide lists of equal sizes. - src (int): Source rank from which to broadcast ``object_list``. - group: (ProcessGroup, optional): The process group to work on. If None, - the default process group will be used. Default is ``None``. - - Returns: - ``None``. If rank is part of the group, ``object_list`` will contain the - broadcasted objects from ``src`` rank. - - .. note:: For NCCL-based processed groups, internal tensor representations - of objects must be moved to the GPU device before communication takes - place. In this case, the device used is given by - ``torch.cuda.current_device()`` and it is the user's responsiblity to - ensure that this is set so that each rank has an individual GPU, via - ``torch.cuda.set_device()``. - - .. note:: Note that this API differs slightly from the :func:`all_gather` - collective since it does not provide an ``async_op`` handle and thus - will be a blocking call. - - .. warning:: - :func:`broadcast_object_list` uses ``pickle`` module implicitly, which - is known to be insecure. It is possible to construct malicious pickle - data which will execute arbitrary code during unpickling. Only call this - function with data you trust. - - Example:: - >>> # Note: Process group initialization omitted on each rank. - >>> import torch.distributed as dist - >>> if dist.get_rank() == 0: - >>> # Assumes world_size of 3. - >>> objects = ["foo", 12, {1: 2}] # any picklable object - >>> else: - >>> objects = [None, None, None] - >>> dist.broadcast_object_list(objects, src=0) - >>> broadcast_objects - ['foo', 12, {1: 2}] - """ +def _broadcast_object_list(object_list, src=0, group=None): if _rank_not_in_group(group): return - + my_rank = get_rank() # Serialize object_list elements to tensors on src rank. if my_rank == src: @@ -122,30 +81,30 @@ def broadcast_object_list(object_list, src=0, group=None): offset = 0 if my_rank != src: for i, obj_size in enumerate(object_sizes_tensor): - obj_view = object_tensor[offset : offset + obj_size] + obj_view = object_tensor[offset:offset + obj_size] obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] offset += obj_size object_list[i] = _tensor_to_object(obj_view, obj_size) class LightningDistributed: - + def __init__(self, rank=None, device=None): self.rank = rank self.device = device - + def broadcast(self, obj: Any, group=None): is_list = isinstance(obj, list) - + if not is_list: obj = [obj] - + if self.rank != 0: obj = [None for _ in range(len(obj))] - - broadcast_object_list(obj, 0, group=group or GroupMember.WORLD) - + + _broadcast_object_list(obj, 0, group=group or GroupMember.WORLD) + if not is_list: obj = obj[0] - + return obj diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 6a1c2caf827c3..465751f60bb49 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -from pytorch_lightning.core.lightning import LightningModule from unittest import mock import pytest @@ -112,20 +111,13 @@ class TestModel(BoringModel): def training_step(self, batch, batch_idx): local_rank = int(os.getenv("LOCAL_RANK")) self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True) - return super().training_step(batch, batch_idx) def training_epoch_end(self, outputs) -> None: - #data = torch.tensor(100) data = str(self.global_rank) - print("before broadcast", self.global_rank, data) out = self.trainer.training_type_plugin.broadcast(str(data)) - # data should remain same assert data == str(self.global_rank) - # out is the broadcast result from rank 0 assert out == "0" - print(self.global_rank, data, out) - model = TestModel() trainer = Trainer( From 237bbd2e484176fb9738587a64ac8e11a6b2f206 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 21:46:35 +0000 Subject: [PATCH 15/54] update --- .../trainer/connectors/logger_connector/epoch_result_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 7a2d2dc7c504b..2e00ce66cee7e 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -23,6 +23,7 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import DistributedType, LightningEnum from pytorch_lightning.utilities.warnings import WarningCache +from torch.distributed.distributed_c10d import GroupMember log = logging.getLogger(__name__) @@ -125,7 +126,7 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) - if torch.distributed.is_initialized(): + if torch.distributed.is_initialized() and GroupMember.WORLD.size() > 1: for non_metric_key in opt_metric.get_non_metrics_keys(): if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: metric = self._all_gather_fn(metrics_to_log[non_metric_key]) From 6fbe70d890e6ff50d513bfacc4f82595d98c1a90 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 21:51:05 +0000 Subject: [PATCH 16/54] check world size --- .../trainer/connectors/logger_connector/epoch_result_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 2e00ce66cee7e..5c52f35c9114d 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -23,7 +23,7 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import DistributedType, LightningEnum from pytorch_lightning.utilities.warnings import WarningCache -from torch.distributed.distributed_c10d import GroupMember +from torch.distributed.distributed_c10d import GroupMember, get_world_size log = logging.getLogger(__name__) @@ -126,7 +126,7 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) - if torch.distributed.is_initialized() and GroupMember.WORLD.size() > 1: + if torch.distributed.is_initialized() and get_world_size() > 1: for non_metric_key in opt_metric.get_non_metrics_keys(): if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: metric = self._all_gather_fn(metrics_to_log[non_metric_key]) From 5f25fc5be4570c2c25672e8b038638e03f6800b6 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 21:56:32 +0000 Subject: [PATCH 17/54] resolve test --- tests/checkpointing/test_model_checkpoint.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 48e4a22e1ec05..7434264e3cfd3 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -44,13 +44,13 @@ class LogInTwoMethods(BoringModel): def training_step(self, batch, batch_idx): out = super().training_step(batch, batch_idx) - self.log('early_stop_on', out['loss']) + self.log('early_stop_on', out['loss'], sync_dist=True) return out def validation_epoch_end(self, outputs): outs = torch.stack([x['x'] for x in outputs]).mean() self.log('epoch', self.current_epoch) - self.log('val_acc', outs) + self.log('val_acc', outs, sync_dist=True) @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @@ -348,7 +348,7 @@ def on_train_start(self, trainer, pl_module): torch.save = Mock(wraps=torch.save) def on_save_checkpoint(self, trainer, pl_module, checkpoint): - # expect all ranks to run but only rank 0 will actually write the checkpoint file + # only rank 0 will call ``torch.save`` super().on_save_checkpoint(trainer, pl_module, checkpoint) self.on_save_checkpoint_count += 1 @@ -358,8 +358,7 @@ def on_train_end(self, trainer, pl_module): assert self.best_model_score assert self.on_save_checkpoint_count == self.expected_count if trainer.is_global_zero: - # twice the calls expected because ddp broadcast also uses torch.save - assert torch.save.call_count == self.expected_count * 2 + assert torch.save.call_count == self.expected_count else: assert torch.save.call_count == 0 From 46cf2c6b301008b4d7f8b0fd971f634f2d1e6abc Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 22:16:17 +0000 Subject: [PATCH 18/54] update --- pytorch_lightning/plugins/training_type/single_device.py | 2 ++ tests/models/test_horovod.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index d3b3401b78b07..8c84456eaabc6 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -23,6 +23,8 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device: torch.device): super().__init__() self.device: torch.device = device + self.world_size = 1 + self.local_rank = 0 @property def on_tpu(self) -> bool: diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 41a5c651e6f53..f7298d90cc928 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -283,7 +283,7 @@ def training_epoch_end(self, outputs) -> None: horovod.run(hvd_test_fn, np=2) -@RunIf(skip_windows=True, horovod=True) +@RunIf(skip_windows=True, horovod=True, num_gpus=2) def test_accuracy_metric_horovod(): num_batches = 10 batch_size = 16 From 8523167da13d204564f7ae0816737359d60895b1 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Mar 2021 12:33:24 +0000 Subject: [PATCH 19/54] use pytorch version when defined --- pytorch_lightning/distributed/dist.py | 75 +----------------- pytorch_lightning/distributed/dist_utils.py | 76 +++++++++++++++++++ .../logger_connector/epoch_result_store.py | 2 +- 3 files changed, 80 insertions(+), 73 deletions(-) create mode 100644 pytorch_lightning/distributed/dist_utils.py diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 7ff3607ec8e7f..3af4098d34a9b 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -11,80 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import pickle from typing import Any -import torch -from torch.distributed.distributed_c10d import ( - _rank_not_in_group, - Backend, - broadcast, - get_backend, - get_rank, - GroupMember, -) +from torch.distributed.distributed_c10d import GroupMember - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 -def _object_to_tensor(obj): - buffer = pickle.dumps(obj) - byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] - byte_tensor = torch.ByteTensor(byte_storage) - local_size = torch.LongTensor([byte_tensor.numel()]) - return byte_tensor, local_size - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py -def _tensor_to_object(tensor, tensor_size): - buf = tensor.numpy().tobytes()[:tensor_size] - out = pickle.loads(buf) - return out - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 -def _broadcast_object_list(object_list, src=0, group=None): - if _rank_not_in_group(group): - return - - my_rank = get_rank() - # Serialize object_list elements to tensors on src rank. - if my_rank == src: - tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) - object_sizes_tensor = torch.cat(size_list) - else: - object_sizes_tensor = torch.LongTensor(len(object_list)) - - group_backend = get_backend(group) - is_nccl_backend = group_backend == Backend.NCCL - current_device = torch.device("cpu") - if is_nccl_backend: - # See note about using torch.cuda.current_device() here in docstring. - # We cannot simply use my_rank since rank == device is not necessarily - # true. - current_device = torch.device('cuda', torch.cuda.current_device()) - object_sizes_tensor = object_sizes_tensor.to(current_device) - object_sizes_tensor = object_sizes_tensor.to(current_device) - - # Broadcast object sizes - broadcast(object_sizes_tensor, src=src, group=group) - - # Concatenate and broadcast serialized object tensors - if my_rank == src: - object_tensor = torch.cat(tensor_list) - else: - object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) - - if is_nccl_backend: - object_tensor = object_tensor.to(current_device) - broadcast(object_tensor, src=src, group=group) - # Deserialize objects using their stored sizes. - offset = 0 - if my_rank != src: - for i, obj_size in enumerate(object_sizes_tensor): - obj_view = object_tensor[offset:offset + obj_size] - obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] - offset += obj_size - object_list[i] = _tensor_to_object(obj_view, obj_size) +from pytorch_lightning.distributed.dist_utils import broadcast_object_list class LightningDistributed: @@ -102,7 +33,7 @@ def broadcast(self, obj: Any, group=None): if self.rank != 0: obj = [None for _ in range(len(obj))] - _broadcast_object_list(obj, 0, group=group or GroupMember.WORLD) + broadcast_object_list(obj, 0, group=group or GroupMember.WORLD) if not is_list: obj = obj[0] diff --git a/pytorch_lightning/distributed/dist_utils.py b/pytorch_lightning/distributed/dist_utils.py new file mode 100644 index 0000000000000..d8a0f09432bcd --- /dev/null +++ b/pytorch_lightning/distributed/dist_utils.py @@ -0,0 +1,76 @@ +import pickle + +import torch +from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank + +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 + +# This part is used to provide broadcast support for PyTorch 1.5 and lower. + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 +def _object_to_tensor(obj): + buffer = pickle.dumps(obj) + byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) + return byte_tensor, local_size + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py +def _tensor_to_object(tensor, tensor_size): + buf = tensor.numpy().tobytes()[:tensor_size] + out = pickle.loads(buf) + return out + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 +def _broadcast_object_list(object_list, src=0, group=None): + if _rank_not_in_group(group): + return + + my_rank = get_rank() + # Serialize object_list elements to tensors on src rank. + if my_rank == src: + tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) + object_sizes_tensor = torch.cat(size_list) + else: + object_sizes_tensor = torch.LongTensor(len(object_list)) + + group_backend = get_backend(group) + is_nccl_backend = group_backend == Backend.NCCL + current_device = torch.device("cpu") + if is_nccl_backend: + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.device('cuda', torch.cuda.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + object_sizes_tensor = object_sizes_tensor.to(current_device) + + # Broadcast object sizes + broadcast(object_sizes_tensor, src=src, group=group) + + # Concatenate and broadcast serialized object tensors + if my_rank == src: + object_tensor = torch.cat(tensor_list) + else: + object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) + + if is_nccl_backend: + object_tensor = object_tensor.to(current_device) + broadcast(object_tensor, src=src, group=group) + # Deserialize objects using their stored sizes. + offset = 0 + if my_rank != src: + for i, obj_size in enumerate(object_sizes_tensor): + obj_view = object_tensor[offset:offset + obj_size] + obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] + offset += obj_size + object_list[i] = _tensor_to_object(obj_view, obj_size) + + +if _TORCH_GREATER_EQUAL_1_6: + from torch.distributed.distributed_c10d import broadcast_object_list +else: + broadcast_object_list = _broadcast_object_list diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 5c52f35c9114d..56ba3659f3f0d 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -17,13 +17,13 @@ from weakref import proxy import torch +from torch.distributed.distributed_c10d import get_world_size import pytorch_lightning as pl from pytorch_lightning.core.step_result import Result from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import DistributedType, LightningEnum from pytorch_lightning.utilities.warnings import WarningCache -from torch.distributed.distributed_c10d import GroupMember, get_world_size log = logging.getLogger(__name__) From f28f95084704685b0d46f31cbbf1b3e9cfb0397e Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Mar 2021 12:42:47 +0000 Subject: [PATCH 20/54] update on comments --- pytorch_lightning/distributed/dist_utils.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/distributed/dist_utils.py b/pytorch_lightning/distributed/dist_utils.py index d8a0f09432bcd..80e28a3202653 100644 --- a/pytorch_lightning/distributed/dist_utils.py +++ b/pytorch_lightning/distributed/dist_utils.py @@ -1,11 +1,26 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import pickle import torch from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_7 -# This part is used to provide broadcast support for PyTorch 1.5 and lower. +# The code below is a copy / paste from Pytorch Codebase and provide broadcast support for PyTorch 1.6 and lower. +# Credit to the PyTorch Team. +# The original source are at: https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py # Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 @@ -70,7 +85,7 @@ def _broadcast_object_list(object_list, src=0, group=None): object_list[i] = _tensor_to_object(obj_view, obj_size) -if _TORCH_GREATER_EQUAL_1_6: +if _TORCH_GREATER_EQUAL_1_7: from torch.distributed.distributed_c10d import broadcast_object_list else: broadcast_object_list = _broadcast_object_list From 6eae79d1d342e27ad150f0a143f3e15dbe7f70ab Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 5 Mar 2021 15:20:33 +0000 Subject: [PATCH 21/54] update on comments --- pytorch_lightning/callbacks/early_stopping.py | 2 +- .../callbacks/model_checkpoint.py | 3 +- pytorch_lightning/distributed/dist.py | 6 +- pytorch_lightning/distributed/dist_utils.py | 76 ------------------ .../plugins/precision/apex_amp.py | 2 - pytorch_lightning/plugins/training_type/dp.py | 7 +- .../plugins/training_type/horovod.py | 4 +- .../plugins/training_type/parallel.py | 8 +- .../plugins/training_type/tpu_spawn.py | 6 -- .../training_type/training_type_plugin.py | 8 +- .../logger_connector/epoch_result_store.py | 2 +- pytorch_lightning/utilities/distributed.py | 79 +++++++++++++++++++ setup.cfg | 8 +- tests/models/test_horovod.py | 7 +- 14 files changed, 98 insertions(+), 120 deletions(-) delete mode 100644 pytorch_lightning/distributed/dist_utils.py diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index ff5cfeaf1bb96..8f54fab546b0e 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -171,4 +171,4 @@ def _run_early_stopping_check(self, trainer, pl_module): trainer.should_stop = True # stop every ddp process if any world process decides to stop - trainer.should_stop = trainer.training_type_plugin.reduce_early_stopping_decision(trainer.should_stop) + trainer.should_stop = trainer.training_type_plugin.reduce_boolean_decision(trainer.should_stop) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index abee6f7fe6f85..b94e3d9173c27 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -328,8 +328,7 @@ def check_monitor_top_k(self, trainer, current: Optional[torch.Tensor]) -> bool: should_update_best_and_save = monitor_op(current, self.best_k_models[self.kth_best_model_path]) # If using multiple devices, make sure all processes are unimanious on the decision. - reduce_model_checkpoint_decision_fn = trainer.training_type_plugin.reduce_model_checkpoint_decision - should_update_best_and_save = reduce_model_checkpoint_decision_fn(should_update_best_and_save) + should_update_best_and_save = trainer.training_type_plugin.reduce_boolean_decision(should_update_best_and_save) return should_update_best_and_save diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 3af4098d34a9b..deaac9fb401cc 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -13,9 +13,7 @@ # limitations under the License. from typing import Any -from torch.distributed.distributed_c10d import GroupMember - -from pytorch_lightning.distributed.dist_utils import broadcast_object_list +from pytorch_lightning.utilities.distributed import broadcast_object_list, group class LightningDistributed: @@ -33,7 +31,7 @@ def broadcast(self, obj: Any, group=None): if self.rank != 0: obj = [None for _ in range(len(obj))] - broadcast_object_list(obj, 0, group=group or GroupMember.WORLD) + broadcast_object_list(obj, 0, group=group or group.WORLD) if not is_list: obj = obj[0] diff --git a/pytorch_lightning/distributed/dist_utils.py b/pytorch_lightning/distributed/dist_utils.py deleted file mode 100644 index d8a0f09432bcd..0000000000000 --- a/pytorch_lightning/distributed/dist_utils.py +++ /dev/null @@ -1,76 +0,0 @@ -import pickle - -import torch -from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank - -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 - -# This part is used to provide broadcast support for PyTorch 1.5 and lower. - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 -def _object_to_tensor(obj): - buffer = pickle.dumps(obj) - byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] - byte_tensor = torch.ByteTensor(byte_storage) - local_size = torch.LongTensor([byte_tensor.numel()]) - return byte_tensor, local_size - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py -def _tensor_to_object(tensor, tensor_size): - buf = tensor.numpy().tobytes()[:tensor_size] - out = pickle.loads(buf) - return out - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 -def _broadcast_object_list(object_list, src=0, group=None): - if _rank_not_in_group(group): - return - - my_rank = get_rank() - # Serialize object_list elements to tensors on src rank. - if my_rank == src: - tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) - object_sizes_tensor = torch.cat(size_list) - else: - object_sizes_tensor = torch.LongTensor(len(object_list)) - - group_backend = get_backend(group) - is_nccl_backend = group_backend == Backend.NCCL - current_device = torch.device("cpu") - if is_nccl_backend: - # See note about using torch.cuda.current_device() here in docstring. - # We cannot simply use my_rank since rank == device is not necessarily - # true. - current_device = torch.device('cuda', torch.cuda.current_device()) - object_sizes_tensor = object_sizes_tensor.to(current_device) - object_sizes_tensor = object_sizes_tensor.to(current_device) - - # Broadcast object sizes - broadcast(object_sizes_tensor, src=src, group=group) - - # Concatenate and broadcast serialized object tensors - if my_rank == src: - object_tensor = torch.cat(tensor_list) - else: - object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) - - if is_nccl_backend: - object_tensor = object_tensor.to(current_device) - broadcast(object_tensor, src=src, group=group) - # Deserialize objects using their stored sizes. - offset = 0 - if my_rank != src: - for i, obj_size in enumerate(object_sizes_tensor): - obj_view = object_tensor[offset:offset + obj_size] - obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] - offset += obj_size - object_list[i] = _tensor_to_object(obj_view, obj_size) - - -if _TORCH_GREATER_EQUAL_1_6: - from torch.distributed.distributed_c10d import broadcast_object_list -else: - broadcast_object_list = _broadcast_object_list diff --git a/pytorch_lightning/plugins/precision/apex_amp.py b/pytorch_lightning/plugins/precision/apex_amp.py index 0e21d40dda917..b600eca5e6bc2 100644 --- a/pytorch_lightning/plugins/precision/apex_amp.py +++ b/pytorch_lightning/plugins/precision/apex_amp.py @@ -45,8 +45,6 @@ def connect(self, model: torch.nn.Module, optimizers: Sequence['Optimizer'], return model, optimizers, lr_schedulers model, optimizers = self.configure_apex(amp, model, list(optimizers), self.amp_level) self.reinit_scheduler_properties(optimizers, lr_schedulers) - for opt in optimizers: - opt.zero_grad() return model, optimizers, lr_schedulers def backward( diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 124bb6e317939..7a1f7ac1201c0 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -71,11 +71,8 @@ def barrier(self, *args, **kwargs): def broadcast(self, obj: object, src: int = 0) -> object: return obj - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - return should_stop - - def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: - return should_update_best_and_save + def reduce_boolean_decision(self, decision: bool) -> bool: + return decision def training_step(self, *args, **kwargs): return self.model(*args, **kwargs) diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index 39594f771e0b3..c2c2eaef5c868 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -21,7 +21,7 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HOROVOD_AVAILABLE -from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp +from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, group if _HOROVOD_AVAILABLE: import horovod.torch as hvd @@ -162,7 +162,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ def all_gather( self, result: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False ) -> torch.Tensor: - if group is not None and group != torch.distributed.group.WORLD: + if group is not None and group != group.WORLD: raise ValueError( "Horovod does not support allgather using a subcommunicator at this time. " "Unset `group`." diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 6920362c721f8..a510a650f62bf 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -74,18 +74,12 @@ def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_gra """Perform a all_gather on all processes """ return all_gather_ddp_if_available(tensor, group=group, sync_grads=sync_grads) - def reduce_decision(self, decision: bool) -> bool: + def reduce_boolean_decision(self, decision: bool) -> bool: decision = torch.tensor(int(decision), device=self.lightning_module.device) decision = self.reduce(decision, reduce_op=ReduceOp.SUM) decision = bool(decision == self.world_size) return decision - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - return self.reduce_decision(should_stop) - - def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: - return self.reduce_decision(should_update_best_and_save) - @property def torch_distributed_backend(self): torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND") diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index a5410277823d8..5e252eeaca19c 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -207,12 +207,6 @@ def reduce_decision(self, decision: bool) -> bool: decision = bool(decision == self.world_size) return decision - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: - return self.reduce_decision(should_stop) - - def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: - return self.reduce_decision(should_update_best_and_save) - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): if not isinstance(output, torch.Tensor): output = torch.tensor(output, device=self.device) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 48b2c02d3214a..b9acbde97a9d0 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -81,13 +81,9 @@ def broadcast(self, obj: object, src: int = 0) -> object: def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: """Perform a all_gather on all processes """ - def reduce_early_stopping_decision(self, should_stop: bool) -> bool: + def reduce_boolean_decision(self, decision: bool) -> bool: """Reduce the early stopping decision across all processes""" - return should_stop - - def reduce_model_checkpoint_decision(self, should_update_best_and_save: bool) -> bool: - """Reduce the model checkpoint decision across all processes""" - return should_update_best_and_save + return decision def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool, optimizer: Optimizer, opt_idx: int): """Run before precision plugin executes backward""" diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 56ba3659f3f0d..8e14e1b60c1d3 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -135,7 +135,7 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} " "doesn't appear to be the same accross all processes. " "HINT: One could either do: `self.log(..., sync_dist=True, sync_fn=torch.mean)`" - " to force mean reduction across processes which can be unaccurate or implement" + " to force mean reduction across processes which can be inaccurate or implement" " a `pytorch_lightning.metrics.Metric`" ) warning_cache.warned_metrics.append(non_metric_key) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index e797c32bbf917..8c57d3586e8cc 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -17,6 +17,12 @@ import warnings from functools import wraps from typing import Any, Optional, Union +import pickle + +import torch +from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank + +from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 import torch @@ -33,6 +39,79 @@ class group: WORLD = None +# This part is used to provide broadcast support for PyTorch 1.5 and lower. + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 +def _object_to_tensor(obj): + buffer = pickle.dumps(obj) + byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) + return byte_tensor, local_size + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py +def _tensor_to_object(tensor, tensor_size): + buf = tensor.numpy().tobytes()[:tensor_size] + out = pickle.loads(buf) + return out + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 +def _broadcast_object_list(object_list, src=0, group=None): + if _rank_not_in_group(group): + return + + my_rank = get_rank() + # Serialize object_list elements to tensors on src rank. + if my_rank == src: + tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) + object_sizes_tensor = torch.cat(size_list) + else: + object_sizes_tensor = torch.LongTensor(len(object_list)) + + group_backend = get_backend(group) + is_nccl_backend = group_backend == Backend.NCCL + current_device = torch.device("cpu") + if is_nccl_backend: + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.device('cuda', torch.cuda.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + object_sizes_tensor = object_sizes_tensor.to(current_device) + + # Broadcast object sizes + broadcast(object_sizes_tensor, src=src, group=group) + + # Concatenate and broadcast serialized object tensors + if my_rank == src: + object_tensor = torch.cat(tensor_list) + else: + object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) + + if is_nccl_backend: + object_tensor = object_tensor.to(current_device) + + broadcast(object_tensor, src=src, group=group) + + # Deserialize objects using their stored sizes. + offset = 0 + if my_rank != src: + for i, obj_size in enumerate(object_sizes_tensor): + obj_view = object_tensor[offset:offset + obj_size] + obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] + offset += obj_size + object_list[i] = _tensor_to_object(obj_view, obj_size) + + +if _TORCH_GREATER_EQUAL_1_6: + from torch.distributed.distributed_c10d import broadcast_object_list +else: + broadcast_object_list = _broadcast_object_list + + def rank_zero_only(fn): @wraps(fn) diff --git a/setup.cfg b/setup.cfg index 65814bb27f79d..97be0b74b8098 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,10 +49,10 @@ omit = pytorch_lightning/utilities/distributed.py pytorch_lightning/tuner/auto_gpu_select.py # TODO: temporary, until accelerator refactor is finished - # pytorch_lightning/accelerators/accelerator.py - #pytorch_lightning/plugins/training_type/*.py - #pytorch_lightning/plugins/precision/*.py - #pytorch_lightning/plugins/base_plugin.py + pytorch_lightning/accelerators/accelerator.py + pytorch_lightning/plugins/training_type/*.py + pytorch_lightning/plugins/precision/*.py + pytorch_lightning/plugins/base_plugin.py [flake8] diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index f7298d90cc928..6f5c9ab2e67df 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -334,7 +334,7 @@ def _compute_batch(): horovod.run(_compute_batch, np=2) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, horovod=True) def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir): class TestModel(BoringModel): @@ -355,8 +355,7 @@ def configure_optimizers(self): num_workers = 8 init_lr = 0.1 * num_workers - with patch('horovod.torch.size') as mock_hvd_size: - mock_hvd_size.return_value = 8 + with patch('horovod.torch.size', return_value=8) as mock_hvd_size: # fit model trainer = Trainer( @@ -364,7 +363,7 @@ def configure_optimizers(self): max_epochs=1, limit_val_batches=0.5, limit_train_batches=0.2, - distributed_backend='horovod' + accelerator='horovod' ) results = trainer.fit(model) assert results == 1 From 9448964459bb591a0cdaca48fd18218f1bc6714d Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Mar 2021 15:27:11 +0000 Subject: [PATCH 22/54] flake8 --- pytorch_lightning/distributed/dist.py | 2 +- pytorch_lightning/plugins/training_type/horovod.py | 7 +++++-- pytorch_lightning/utilities/distributed.py | 8 +++----- tests/models/test_horovod.py | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index deaac9fb401cc..b82993bbf2e52 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -22,7 +22,7 @@ def __init__(self, rank=None, device=None): self.rank = rank self.device = device - def broadcast(self, obj: Any, group=None): + def broadcast(self, obj: Any, group=group.WORLD): is_list = isinstance(obj, list) if not is_list: diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py index c2c2eaef5c868..bfc920baf805a 100644 --- a/pytorch_lightning/plugins/training_type/horovod.py +++ b/pytorch_lightning/plugins/training_type/horovod.py @@ -21,7 +21,7 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin from pytorch_lightning.utilities import _HOROVOD_AVAILABLE -from pytorch_lightning.utilities.distributed import rank_zero_only, ReduceOp, group +from pytorch_lightning.utilities.distributed import group, rank_zero_only, ReduceOp if _HOROVOD_AVAILABLE: import horovod.torch as hvd @@ -160,7 +160,10 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ return hvd.allreduce(tensor, op=reduce_op) def all_gather( - self, result: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False + self, + result: Union[torch.Tensor], + group: Optional[Any] = group.WORLD, + sync_grads: bool = False ) -> torch.Tensor: if group is not None and group != group.WORLD: raise ValueError( diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 8c57d3586e8cc..bcd8722f84cca 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -14,18 +14,16 @@ import logging import os +import pickle import warnings from functools import wraps from typing import Any, Optional, Union -import pickle import torch from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 -import torch - log = logging.getLogger(__name__) if torch.distributed.is_available(): @@ -93,9 +91,9 @@ def _broadcast_object_list(object_list, src=0, group=None): if is_nccl_backend: object_tensor = object_tensor.to(current_device) - + broadcast(object_tensor, src=src, group=group) - + # Deserialize objects using their stored sizes. offset = 0 if my_rank != src: diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 6f5c9ab2e67df..630c55ed4a4b2 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -355,7 +355,7 @@ def configure_optimizers(self): num_workers = 8 init_lr = 0.1 * num_workers - with patch('horovod.torch.size', return_value=8) as mock_hvd_size: + with patch('horovod.torch.size', return_value=8): # fit model trainer = Trainer( From 1b5c90a1671bfa96daac9710d3bfa191f60476d2 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 5 Mar 2021 15:33:50 +0000 Subject: [PATCH 23/54] resolve bugs --- pytorch_lightning/distributed/dist.py | 2 +- pytorch_lightning/utilities/distributed.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index deaac9fb401cc..b82993bbf2e52 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -22,7 +22,7 @@ def __init__(self, rank=None, device=None): self.rank = rank self.device = device - def broadcast(self, obj: Any, group=None): + def broadcast(self, obj: Any, group=group.WORLD): is_list = isinstance(obj, list) if not is_list: diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 8c57d3586e8cc..ea9049c4f6d11 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -22,7 +22,7 @@ import torch from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank -from pytorch_lightning.utilities import _TORCH_GREATER_EQUAL_1_6 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_6 import torch From c21f1489b8781c741dcfcf165b8841200ceabf80 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 5 Mar 2021 23:38:20 +0000 Subject: [PATCH 24/54] Update CHANGELOG.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c531c4974b54..9f5c908548d99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -86,9 +86,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `ModelCheckpoint` file_exists using all_gather + add warnings with DDP when non-metrics aren't reduced ([#6345](https://github.com/PyTorchLightning/pytorch-lightning/pull/6345)) -- Fixed DP reduction with collection ([#6324](https://github.com/PyTorchLightning/pytorch-lightning/pull/6324)) - - - Fixed PyTorch Profiler with `emit_nvtx` ([#6260](https://github.com/PyTorchLightning/pytorch-lightning/pull/6260)) From 94e9aa923d64de5a7ca860f6ff195396aa871ba7 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Mar 2021 23:43:15 +0000 Subject: [PATCH 25/54] update --- pytorch_lightning/distributed/dist.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index b82993bbf2e52..7c5e1d6f84669 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -13,7 +13,8 @@ # limitations under the License. from typing import Any -from pytorch_lightning.utilities.distributed import broadcast_object_list, group +from pytorch_lightning.utilities.distributed import broadcast_object_list +from pytorch_lightning.utilities.distributed import group as _group class LightningDistributed: @@ -22,7 +23,7 @@ def __init__(self, rank=None, device=None): self.rank = rank self.device = device - def broadcast(self, obj: Any, group=group.WORLD): + def broadcast(self, obj: Any, group=_group.WORLD): is_list = isinstance(obj, list) if not is_list: @@ -31,7 +32,7 @@ def broadcast(self, obj: Any, group=group.WORLD): if self.rank != 0: obj = [None for _ in range(len(obj))] - broadcast_object_list(obj, 0, group=group or group.WORLD) + broadcast_object_list(obj, 0, group=group or _group.WORLD) if not is_list: obj = obj[0] From b260bf6d13caf1ec0661351cebd6f0d33e52e53b Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Mar 2021 23:49:46 +0000 Subject: [PATCH 26/54] update --- tests/models/test_horovod.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 630c55ed4a4b2..e24da81546249 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -31,7 +31,6 @@ from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import _HOROVOD_AVAILABLE -from tests import _PROJECT_ROOT from tests.helpers import BoringModel from tests.helpers.advanced_models import BasicGAN from tests.helpers.runif import RunIf @@ -50,11 +49,12 @@ def _run_horovod(trainer_options, on_gpu=False): # for Horovod, we interpret `gpus` to be set per worker trainer_options.update(gpus=1 if on_gpu else None) tutils.reset_seed() - append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' + # todo: Find why coverage breaks CI. + # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else '' # noqa E265 + # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append, # noqa E265 cmdline = [ 'horovodrun', '-np', - str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append, - TEST_SCRIPT, '--trainer-options', + str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options', TEST_SCRIPT, '--trainer-options', shlex.quote(json.dumps(trainer_options)) ] if on_gpu: From dd60ed145cef1d219cb07a90bba454deea990211 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 5 Mar 2021 23:58:34 +0000 Subject: [PATCH 27/54] update --- pytorch_lightning/utilities/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index f556f40ed5c53..a45cc9fe1f0e9 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -22,7 +22,7 @@ import torch from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_6 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 log = logging.getLogger(__name__) @@ -104,7 +104,7 @@ def _broadcast_object_list(object_list, src=0, group=None): object_list[i] = _tensor_to_object(obj_view, obj_size) -if _TORCH_GREATER_EQUAL_1_6: +if _TORCH_GREATER_EQUAL_1_7: from torch.distributed.distributed_c10d import broadcast_object_list else: broadcast_object_list = _broadcast_object_list From 45b65f1cf9fc4d07c035227f058b2589c120518f Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 00:18:37 +0000 Subject: [PATCH 28/54] update --- tests/models/test_horovod.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index e24da81546249..f5aadb59c678e 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -203,7 +203,7 @@ def validation_step(self, batch, *args, **kwargs): tpipes.run_model_test_without_loggers(trainer_options, model) -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, horovod=True) def test_horovod_multi_optimizer(tmpdir): model = BasicGAN() @@ -235,6 +235,7 @@ def get_optimizer_params(optimizer): assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1]) +@patch("torch.save") # need to mock torch.save or we get pickle error @RunIf(skip_windows=True, horovod=True) def test_result_reduce_horovod(tmpdir): """Make sure result logging works with Horovod. @@ -276,6 +277,7 @@ def training_epoch_end(self, outputs) -> None: max_epochs=1, log_every_n_steps=1, weights_summary=None, + logger=False ) trainer.fit(model) @@ -283,6 +285,7 @@ def training_epoch_end(self, outputs) -> None: horovod.run(hvd_test_fn, np=2) +@patch("torch.save") # need to mock torch.save or we get pickle error @RunIf(skip_windows=True, horovod=True, num_gpus=2) def test_accuracy_metric_horovod(): num_batches = 10 @@ -298,10 +301,7 @@ def sk_metric(preds, target): target = torch.randint(high=2, size=(num_batches, batch_size)) def _compute_batch(): - trainer = Trainer( - fast_dev_run=True, - accelerator='horovod', - ) + trainer = Trainer(fast_dev_run=True, accelerator='horovod', logger=False) assert isinstance(trainer.accelerator, CPUAccelerator) # TODO: test that we selected the correct training_type_plugin based on horovod flags From dcd6884993b7be1f83e51483ade467dc8a414a14 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 00:41:05 +0000 Subject: [PATCH 29/54] remove test --- tests/models/test_horovod.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index f5aadb59c678e..4da65cdb39657 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -235,7 +235,7 @@ def get_optimizer_params(optimizer): assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1]) -@patch("torch.save") # need to mock torch.save or we get pickle error +@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied") @RunIf(skip_windows=True, horovod=True) def test_result_reduce_horovod(tmpdir): """Make sure result logging works with Horovod. @@ -285,7 +285,7 @@ def training_epoch_end(self, outputs) -> None: horovod.run(hvd_test_fn, np=2) -@patch("torch.save") # need to mock torch.save or we get pickle error +@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied") @RunIf(skip_windows=True, horovod=True, num_gpus=2) def test_accuracy_metric_horovod(): num_batches = 10 From 2e046e80f37d3d54298c24979f2fc8fdf5398a32 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Sat, 6 Mar 2021 12:10:57 +0000 Subject: [PATCH 30/54] update --- pytorch_lightning/utilities/distributed.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index a45cc9fe1f0e9..2e299464d138a 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -20,14 +20,22 @@ from typing import Any, Optional, Union import torch -from torch.distributed.distributed_c10d import _rank_not_in_group, Backend, broadcast, get_backend, get_rank from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 log = logging.getLogger(__name__) if torch.distributed.is_available(): - from torch.distributed import group, ReduceOp + from torch.distributed import ( + group, + ReduceOp, + Backend, + broadcast, + get_backend, + get_rank, + GroupMember + ) + else: class ReduceOp: @@ -39,6 +47,14 @@ class group: # This part is used to provide broadcast support for PyTorch 1.5 and lower. +def _rank_not_in_group(group): + """ + Helper that checks if the current process's rank is not in a given group. + """ + if group is None: + return False + return group == GroupMember.NON_GROUP_MEMBER + # Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 def _object_to_tensor(obj): @@ -104,7 +120,7 @@ def _broadcast_object_list(object_list, src=0, group=None): object_list[i] = _tensor_to_object(obj_view, obj_size) -if _TORCH_GREATER_EQUAL_1_7: +if _TORCH_GREATER_EQUAL_1_7 and torch.distributed.is_available(): from torch.distributed.distributed_c10d import broadcast_object_list else: broadcast_object_list = _broadcast_object_list From 23b2c10b80da82ecf6a86dbe9b27a4e52404a493 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 12:19:22 +0000 Subject: [PATCH 31/54] resolve flake8 --- pytorch_lightning/utilities/distributed.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 2e299464d138a..6e71ea90686ee 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -26,15 +26,7 @@ log = logging.getLogger(__name__) if torch.distributed.is_available(): - from torch.distributed import ( - group, - ReduceOp, - Backend, - broadcast, - get_backend, - get_rank, - GroupMember - ) + from torch.distributed import Backend, broadcast, get_backend, get_rank, group, GroupMember, ReduceOp else: @@ -46,7 +38,7 @@ class group: # This part is used to provide broadcast support for PyTorch 1.5 and lower. - +# https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L160 def _rank_not_in_group(group): """ Helper that checks if the current process's rank is not in a given group. From b4c663b3ecd518b0ded128a9b2136494a9f8e547 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 12:39:27 +0000 Subject: [PATCH 32/54] update --- .../trainer/connectors/logger_connector/epoch_result_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 8e14e1b60c1d3..34d5ec978e2b6 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -17,7 +17,7 @@ from weakref import proxy import torch -from torch.distributed.distributed_c10d import get_world_size +from torch.distributed import get_world_size import pytorch_lightning as pl from pytorch_lightning.core.step_result import Result From 73e83f79ee744341653da2558b3a23dbf34cd0f9 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 13:05:05 +0000 Subject: [PATCH 33/54] update --- .../trainer/connectors/logger_connector/epoch_result_store.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 34d5ec978e2b6..36555b1fa89a7 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -17,7 +17,6 @@ from weakref import proxy import torch -from torch.distributed import get_world_size import pytorch_lightning as pl from pytorch_lightning.core.step_result import Result @@ -125,8 +124,7 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) - - if torch.distributed.is_initialized() and get_world_size() > 1: + if torch.distributed.is_initialized() and self._all_gather_fn.__self__.trainer.world_size > 1: for non_metric_key in opt_metric.get_non_metrics_keys(): if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: metric = self._all_gather_fn(metrics_to_log[non_metric_key]) From 2eb6db46803466e708fb919431d998e8668154cb Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 13:26:32 +0000 Subject: [PATCH 34/54] update --- .../connectors/logger_connector/epoch_result_store.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 5867641efed21..9b4ab64e970c1 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -124,7 +124,10 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) - if torch.distributed.is_initialized() and self._all_gather_fn.__self__.trainer.world_size > 1: + if ( + torch.distributed.is_available() and torch.distributed.is_initialized() + and self._all_gather_fn.__self__.trainer.world_size > 1 + ): for non_metric_key in opt_metric.get_non_metrics_keys(): if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: metric = self._all_gather_fn(metrics_to_log[non_metric_key]) @@ -253,6 +256,7 @@ class EpochResultStore: epoch_result_store.cache_result() ``` """ + def __init__(self, trainer: 'pl.Trainer') -> None: self.trainer = trainer self.reset() From 060992b0ce1ae2243c0dc3d49d0908e4e6ecffc8 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 13:37:51 +0000 Subject: [PATCH 35/54] proxy --- .../trainer/connectors/logger_connector/epoch_result_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 9b4ab64e970c1..d1e849f61ce35 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -258,7 +258,7 @@ class EpochResultStore: """ def __init__(self, trainer: 'pl.Trainer') -> None: - self.trainer = trainer + self.trainer = proxy(trainer) self.reset() def __getitem__(self, key: str) -> Any: From 5bad1353e9bef209068a5cb57761be00b9a5f72e Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 14:25:40 +0000 Subject: [PATCH 36/54] update --- tests/special_tests.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 43658721e9226..18dd6d45fba4e 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -19,10 +19,11 @@ python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_ python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic -python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance +# todo (tchaton) tests are hanging in CI. +# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual +# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp +# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic +# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_dp From 4579842f5f137eddfa415a10864588ae71ecaac6 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 6 Mar 2021 17:58:41 +0000 Subject: [PATCH 37/54] update --- pytorch_lightning/distributed/dist.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 7c5e1d6f84669..3b3e7890b3944 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -24,17 +24,12 @@ def __init__(self, rank=None, device=None): self.device = device def broadcast(self, obj: Any, group=_group.WORLD): - is_list = isinstance(obj, list) - - if not is_list: - obj = [obj] + # always wrap into a list so list can be brodcasted. + obj = [obj] if self.rank != 0: obj = [None for _ in range(len(obj))] broadcast_object_list(obj, 0, group=group or _group.WORLD) - if not is_list: - obj = obj[0] - - return obj + return obj[0] From 80278388f6af0271a483cb913ecc9cf9a923d7bd Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 8 Mar 2021 18:59:20 +0000 Subject: [PATCH 38/54] resolve typo --- pytorch_lightning/callbacks/model_checkpoint.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 39645595bc90f..04298145a8b0e 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -534,7 +534,7 @@ def _save_top_k_checkpoint(self, trainer, monitor_candidates: Dict[str, Any]): step = monitor_candidates.get("step") if self.check_monitor_top_k(trainer, current): - self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics) + self._update_best_and_save(current, epoch, step, trainer, monitor_candidates) elif self.monitor is not None and self.verbose: rank_zero_info(f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}") @@ -551,9 +551,7 @@ def _save_none_monitor_checkpoint(self, trainer, monitor_candidates: Dict[str, A self._save_model(trainer, filepath) if ( - self.save_top_k is None - and self.best_model_path - and self.best_model_path != filepath + self.save_top_k is None and self.best_model_path and self.best_model_path != filepath and trainer.is_global_zero ): self._del_model(self.best_model_path) From aa9a6ca0ecb47d110988dbb5d7eb6363a71f9301 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 8 Mar 2021 19:04:42 +0000 Subject: [PATCH 39/54] prune --- pytorch_lightning/core/step_result.py | 6 ---- .../logger_connector/epoch_result_store.py | 36 ++----------------- pytorch_lightning/utilities/apply_func.py | 7 ---- tests/checkpointing/test_model_checkpoint.py | 4 +-- tests/models/test_horovod.py | 2 +- tests/special_tests.sh | 4 +-- tests/utilities/test_all_gather_grad.py | 1 + 7 files changed, 9 insertions(+), 51 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index bb13480c35376..f8d7a2ffe3a23 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -633,12 +633,6 @@ def rename_keys(self, map_dict: dict): meta[dest] = meta[source] del meta[source] - def get_non_metrics_keys(self): - """ - This function is used to filter metric keys for which the value isn't a Metric - """ - return [k for k, v in self.items() if not isinstance(v, Metric)] - def choose_last(x): if isinstance(x, (torch.Tensor, list)): diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index d1e849f61ce35..a251c400b6f46 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -13,7 +13,7 @@ # limitations under the License. import logging from collections import defaultdict -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from weakref import proxy import torch @@ -22,21 +22,10 @@ from pytorch_lightning.core.step_result import Result from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import DistributedType, LightningEnum -from pytorch_lightning.utilities.warnings import WarningCache log = logging.getLogger(__name__) -class MetricWarningCache(WarningCache): - - def __init__(self): - super().__init__() - self.warned_metrics = [] - - -warning_cache = MetricWarningCache() - - class ResultStoreType(LightningEnum): INSIDE_BATCH_TRAIN_LOOP = "inside_batch_train_loop" OUTSIDE_BATCH_TRAIN_LOOP = "outside_batch_train_loop" @@ -66,9 +55,8 @@ class HookResultStore: Those data structures enables us to reduce properly Result object when batch loop is finished. """ - def __init__(self, fx_name: str, all_gather_fn: Callable) -> None: + def __init__(self, fx_name: str) -> None: self._fx_name = fx_name - self._all_gather_fn = all_gather_fn self._internals = {} self._internals_reduced = {} self._internal_type = None @@ -124,23 +112,6 @@ def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> Non func = getattr(opt_metric, func_name) metrics_to_log = func(*args, add_dataloader_idx=self.has_several_dataloaders, **kwargs) - if ( - torch.distributed.is_available() and torch.distributed.is_initialized() - and self._all_gather_fn.__self__.trainer.world_size > 1 - ): - for non_metric_key in opt_metric.get_non_metrics_keys(): - if non_metric_key in metrics_to_log and non_metric_key not in warning_cache.warned_metrics: - metric = self._all_gather_fn(metrics_to_log[non_metric_key]) - if any(metric[0] != m for m in metric[1:]): - warning_cache.warn( - f"The value associated to the key {non_metric_key}: {metric.cpu().tolist()} " - "doesn't appear to be the same accross all processes. " - "HINT: One could either do: `self.log(..., sync_dist=True, sync_fn=torch.mean)`" - " to force mean reduction across processes which can be inaccurate or implement" - " a `pytorch_lightning.metrics.Metric`" - ) - warning_cache.warned_metrics.append(non_metric_key) - results.append(metrics_to_log) def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> List[Dict]: @@ -310,8 +281,7 @@ def cache_result(self) -> None: info = self.info fx_name = info["fx_name"] - all_gather_fn = self.trainer.lightning_module.all_gather - self._internals.setdefault(fx_name, HookResultStore(fx_name, all_gather_fn)) + self._internals.setdefault(fx_name, HookResultStore(fx_name)) # attach capture batch_size Result.attach_batch_size(self._batch_size, hook_result) diff --git a/pytorch_lightning/utilities/apply_func.py b/pytorch_lightning/utilities/apply_func.py index 60bcd40cd22fe..9fd42008b9d8d 100644 --- a/pytorch_lightning/utilities/apply_func.py +++ b/pytorch_lightning/utilities/apply_func.py @@ -166,11 +166,4 @@ def convert_to_tensors(data, device: torch.device = None): raise MisconfigurationException("device (torch.device) should be provided.") for src_dtype, conversion_func in CONVERSION_DTYPES: data = apply_to_collection(data, src_dtype, partial(conversion_func, device=device)) - - def move_to_device(value, device=None): - if device is not None: - value = value.to(device) - return value.contiguous() - - data = apply_to_collection(data, torch.Tensor, partial(move_to_device, device=device)) return data diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index ea43014af6557..4fb5a6566ae4a 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -45,13 +45,13 @@ class LogInTwoMethods(BoringModel): def training_step(self, batch, batch_idx): out = super().training_step(batch, batch_idx) - self.log('early_stop_on', out['loss'], sync_dist=True) + self.log('early_stop_on', out['loss']) return out def validation_epoch_end(self, outputs): outs = torch.stack([x['x'] for x in outputs]).mean() self.log('epoch', self.current_epoch) - self.log('val_acc', outs, sync_dist=True) + self.log('val_acc', outs) @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 4da65cdb39657..3c8c9b0f36041 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -54,7 +54,7 @@ def _run_horovod(trainer_options, on_gpu=False): # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append, # noqa E265 cmdline = [ 'horovodrun', '-np', - str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options', TEST_SCRIPT, '--trainer-options', + str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options', shlex.quote(json.dumps(trainer_options)) ] if on_gpu: diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 18dd6d45fba4e..52990c996321a 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -21,9 +21,9 @@ python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multig python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp # todo (tchaton) tests are hanging in CI. # python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual -# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp +python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp # python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic -# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance +python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_dp diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index ae3addb4d6a66..6a30fc60fe607 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -58,6 +58,7 @@ def training_epoch_end(self, outputs) -> None: self.training_epoch_end_called = True losses = torch.stack([x["loss"] for x in outputs]) gathered_loss = self.all_gather({ + "losses_tensor": torch.FloatTensor([1, 2, 3]), "losses_np_ndarray": np.array([1, 2, 3]), "losses_bool": [True, False], "losses_float": [0., 1., 2.], From 4b6a6c5932ede38f9827a1c91592569915b75964 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 8 Mar 2021 19:15:40 +0000 Subject: [PATCH 40/54] update parallel --- pytorch_lightning/plugins/training_type/parallel.py | 11 ----------- .../test_checkpoint_callback_frequency.py | 7 ++++--- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index a510a650f62bf..a813bdb58ee68 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import io import os from abc import ABC, abstractmethod from contextlib import contextmanager @@ -116,13 +115,3 @@ def block_backward_sync(self): yield None else: yield None - - def broadcast(self, obj: object, src: int) -> object: - buffer = io.BytesIO() - torch.save(obj, buffer) - data = bytearray(buffer.getbuffer()) - data_tensor = torch.tensor(data).to(self.root_device, dtype=torch.float) - data = self.all_gather(data_tensor) - buffer = io.BytesIO(data.cpu().byte().numpy()) - obj = torch.load(buffer) - return obj diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 465751f60bb49..ca37a2d3ec86a 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -115,9 +115,10 @@ def training_step(self, batch, batch_idx): def training_epoch_end(self, outputs) -> None: data = str(self.global_rank) - out = self.trainer.training_type_plugin.broadcast(str(data)) - assert data == str(self.global_rank) - assert out == "0" + obj = [[data], (data, ), set(data)] + out = self.trainer.training_type_plugin.broadcast(obj) + assert obj == [[str(self.global_rank)], (str(self.global_rank), ), set(str(self.global_rank))] + assert out == [['0'], ('0', ), set('0')] model = TestModel() trainer = Trainer( From 4b55c522fbaf06852b4a2e03f540a6b872dbfbe0 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 8 Mar 2021 19:16:50 +0000 Subject: [PATCH 41/54] update --- tests/utilities/test_all_gather_grad.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/utilities/test_all_gather_grad.py b/tests/utilities/test_all_gather_grad.py index 6a30fc60fe607..ae3addb4d6a66 100644 --- a/tests/utilities/test_all_gather_grad.py +++ b/tests/utilities/test_all_gather_grad.py @@ -58,7 +58,6 @@ def training_epoch_end(self, outputs) -> None: self.training_epoch_end_called = True losses = torch.stack([x["loss"] for x in outputs]) gathered_loss = self.all_gather({ - "losses_tensor": torch.FloatTensor([1, 2, 3]), "losses_np_ndarray": np.array([1, 2, 3]), "losses_bool": [True, False], "losses_float": [0., 1., 2.], From cbacf484ef6fce8de4cd4210431e750ed909b9ee Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 8 Mar 2021 19:23:02 +0000 Subject: [PATCH 42/54] update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d741352d2e23..a8d8da0024da1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -119,7 +119,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed DP reduction with collection ([#6324](https://github.com/PyTorchLightning/pytorch-lightning/pull/6324)) -- Fixed `ModelCheckpoint` file_exists using all_gather + add warnings with DDP when non-metrics aren't reduced ([#6345](https://github.com/PyTorchLightning/pytorch-lightning/pull/6345)) +- Fixed broacast to use PyTorch `broadcast_object_list` and add `reduce_decision` ([#6410](https://github.com/PyTorchLightning/pytorch-lightning/pull/6410)) - Fixed `.teardown(stage='fit')` getting called during `trainer.test` ([#6386](https://github.com/PyTorchLightning/pytorch-lightning/pull/6386)) @@ -134,6 +134,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `Trainer` not resetting `lightning_optimizers` when calling `Trainer.fit()` multiple times ([#6372](https://github.com/PyTorchLightning/pytorch-lightning/pull/6372)) + ## [1.2.2] - 2021-03-02 ### Added From 057fbf3ddc89fb38a511822d08f285e3487e0ea4 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 9 Mar 2021 08:31:44 +0000 Subject: [PATCH 43/54] update --- .../callbacks/model_checkpoint.py | 2 +- pytorch_lightning/distributed/dist.py | 4 +- pytorch_lightning/utilities/distributed.py | 86 +---------------- .../utilities/torch_distributed.py | 94 +++++++++++++++++++ 4 files changed, 98 insertions(+), 88 deletions(-) create mode 100644 pytorch_lightning/utilities/torch_distributed.py diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 04298145a8b0e..4cc26f3d81d8f 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -326,7 +326,7 @@ def _do_save(self, trainer, filepath: str): else: raise ValueError(".save_function() not set") - def check_monitor_top_k(self, trainer, current: Optional[torch.Tensor]) -> bool: + def check_monitor_top_k(self, trainer, current: Optional[torch.Tensor] = None) -> bool: if current is None: return False diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index 3b3e7890b3944..d11d5a610883b 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -13,8 +13,8 @@ # limitations under the License. from typing import Any -from pytorch_lightning.utilities.distributed import broadcast_object_list from pytorch_lightning.utilities.distributed import group as _group +from pytorch_lightning.utilities.torch_distributed import broadcast_object_list class LightningDistributed: @@ -28,7 +28,7 @@ def broadcast(self, obj: Any, group=_group.WORLD): obj = [obj] if self.rank != 0: - obj = [None for _ in range(len(obj))] + obj = [None] * len(obj) broadcast_object_list(obj, 0, group=group or _group.WORLD) diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 6e71ea90686ee..658f349a22215 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -14,19 +14,16 @@ import logging import os -import pickle import warnings from functools import wraps from typing import Any, Optional, Union import torch -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 - log = logging.getLogger(__name__) if torch.distributed.is_available(): - from torch.distributed import Backend, broadcast, get_backend, get_rank, group, GroupMember, ReduceOp + from torch.distributed import group, ReduceOp else: @@ -37,87 +34,6 @@ class group: WORLD = None -# This part is used to provide broadcast support for PyTorch 1.5 and lower. -# https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L160 -def _rank_not_in_group(group): - """ - Helper that checks if the current process's rank is not in a given group. - """ - if group is None: - return False - return group == GroupMember.NON_GROUP_MEMBER - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 -def _object_to_tensor(obj): - buffer = pickle.dumps(obj) - byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] - byte_tensor = torch.ByteTensor(byte_storage) - local_size = torch.LongTensor([byte_tensor.numel()]) - return byte_tensor, local_size - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py -def _tensor_to_object(tensor, tensor_size): - buf = tensor.numpy().tobytes()[:tensor_size] - out = pickle.loads(buf) - return out - - -# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 -def _broadcast_object_list(object_list, src=0, group=None): - if _rank_not_in_group(group): - return - - my_rank = get_rank() - # Serialize object_list elements to tensors on src rank. - if my_rank == src: - tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) - object_sizes_tensor = torch.cat(size_list) - else: - object_sizes_tensor = torch.LongTensor(len(object_list)) - - group_backend = get_backend(group) - is_nccl_backend = group_backend == Backend.NCCL - current_device = torch.device("cpu") - if is_nccl_backend: - # See note about using torch.cuda.current_device() here in docstring. - # We cannot simply use my_rank since rank == device is not necessarily - # true. - current_device = torch.device('cuda', torch.cuda.current_device()) - object_sizes_tensor = object_sizes_tensor.to(current_device) - object_sizes_tensor = object_sizes_tensor.to(current_device) - - # Broadcast object sizes - broadcast(object_sizes_tensor, src=src, group=group) - - # Concatenate and broadcast serialized object tensors - if my_rank == src: - object_tensor = torch.cat(tensor_list) - else: - object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) - - if is_nccl_backend: - object_tensor = object_tensor.to(current_device) - - broadcast(object_tensor, src=src, group=group) - - # Deserialize objects using their stored sizes. - offset = 0 - if my_rank != src: - for i, obj_size in enumerate(object_sizes_tensor): - obj_view = object_tensor[offset:offset + obj_size] - obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] - offset += obj_size - object_list[i] = _tensor_to_object(obj_view, obj_size) - - -if _TORCH_GREATER_EQUAL_1_7 and torch.distributed.is_available(): - from torch.distributed.distributed_c10d import broadcast_object_list -else: - broadcast_object_list = _broadcast_object_list - - def rank_zero_only(fn): @wraps(fn) diff --git a/pytorch_lightning/utilities/torch_distributed.py b/pytorch_lightning/utilities/torch_distributed.py new file mode 100644 index 0000000000000..67b64c046dc18 --- /dev/null +++ b/pytorch_lightning/utilities/torch_distributed.py @@ -0,0 +1,94 @@ +import logging +import pickle + +import torch + +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_7 + +log = logging.getLogger(__name__) + +if torch.distributed.is_available(): + from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember + +# The code underneath is taken from PyTorch ``torch/distributed/distributed_c10d.py`` +# and enable broadcasting for PyTorch 1.6 and lower. + + +# https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L160 +def _rank_not_in_group(group): + """ + Helper that checks if the current process's rank is not in a given group. + """ + if group is None: + return False + return group == GroupMember.NON_GROUP_MEMBER + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1164 +def _object_to_tensor(obj): + buffer = pickle.dumps(obj) + byte_storage = torch.ByteStorage.from_buffer(buffer) # type: ignore[attr-defined] + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) + return byte_tensor, local_size + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py +def _tensor_to_object(tensor, tensor_size): + buf = tensor.numpy().tobytes()[:tensor_size] + out = pickle.loads(buf) + return out + + +# Taken from https://github.com/pytorch/pytorch/blob/1.7/torch/distributed/distributed_c10d.py#L1327 +def _broadcast_object_list(object_list, src=0, group=None): + if _rank_not_in_group(group): + return + + my_rank = get_rank() + # Serialize object_list elements to tensors on src rank. + if my_rank == src: + tensor_list, size_list = zip(*[_object_to_tensor(obj) for obj in object_list]) + object_sizes_tensor = torch.cat(size_list) + else: + object_sizes_tensor = torch.LongTensor(len(object_list)) + + group_backend = get_backend(group) + is_nccl_backend = group_backend == Backend.NCCL + current_device = torch.device("cpu") + if is_nccl_backend: + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.device('cuda', torch.cuda.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + object_sizes_tensor = object_sizes_tensor.to(current_device) + + # Broadcast object sizes + broadcast(object_sizes_tensor, src=src, group=group) + + # Concatenate and broadcast serialized object tensors + if my_rank == src: + object_tensor = torch.cat(tensor_list) + else: + object_tensor = torch.ByteTensor(torch.sum(object_sizes_tensor).item()) + + if is_nccl_backend: + object_tensor = object_tensor.to(current_device) + + broadcast(object_tensor, src=src, group=group) + + # Deserialize objects using their stored sizes. + offset = 0 + if my_rank != src: + for i, obj_size in enumerate(object_sizes_tensor): + obj_view = object_tensor[offset:offset + obj_size] + obj_view = obj_view.type(torch.ByteTensor) # type: ignore[call-overload] + offset += obj_size + object_list[i] = _tensor_to_object(obj_view, obj_size) + + +if _TORCH_GREATER_EQUAL_1_7 and torch.distributed.is_available(): + from torch.distributed.distributed_c10d import broadcast_object_list +else: + broadcast_object_list = _broadcast_object_list From 7cbf38be59d2e3f1adeb4095dcc118753e744f8d Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 9 Mar 2021 16:57:54 +0000 Subject: [PATCH 44/54] try running pipe --- tests/special_tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 52990c996321a..ac355c423c859 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -20,9 +20,9 @@ python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp # todo (tchaton) tests are hanging in CI. -# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual +python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp -# python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic +python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_with_wrong_balance python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_collection python ${DEFAULTS} tests/trainer/test_trainer.py::test_trainer_predict_ddp From 690b61f89cc12d7c0db787a846570079ad1f780e Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Mar 2021 11:08:20 +0000 Subject: [PATCH 45/54] Update pytorch_lightning/callbacks/model_checkpoint.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 4cc26f3d81d8f..e341a55881271 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -535,7 +535,7 @@ def _save_top_k_checkpoint(self, trainer, monitor_candidates: Dict[str, Any]): if self.check_monitor_top_k(trainer, current): self._update_best_and_save(current, epoch, step, trainer, monitor_candidates) - elif self.monitor is not None and self.verbose: + elif self.verbose: rank_zero_info(f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}") def _save_none_monitor_checkpoint(self, trainer, monitor_candidates: Dict[str, Any]): From 300a63278ff68240ba5b78fe3f16e4e7a5e1867b Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Mar 2021 11:12:46 +0000 Subject: [PATCH 46/54] update on comments --- .../trainer/connectors/logger_connector/epoch_result_store.py | 3 --- tests/special_tests.sh | 1 - 2 files changed, 4 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index a251c400b6f46..7759c8028d325 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import logging from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple from weakref import proxy @@ -23,8 +22,6 @@ from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import DistributedType, LightningEnum -log = logging.getLogger(__name__) - class ResultStoreType(LightningEnum): INSIDE_BATCH_TRAIN_LOOP = "inside_batch_train_loop" diff --git a/tests/special_tests.sh b/tests/special_tests.sh index ac355c423c859..43658721e9226 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -19,7 +19,6 @@ python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_ python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp -# todo (tchaton) tests are hanging in CI. python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual_amp python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_automatic From 5e3037775fbfb25b693a346fee427f6567c5bd37 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 11 Mar 2021 19:29:32 +0000 Subject: [PATCH 47/54] Update pytorch_lightning/callbacks/model_checkpoint.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index e341a55881271..afeffc78607d3 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -348,7 +348,7 @@ def check_monitor_top_k(self, trainer, current: Optional[torch.Tensor] = None) - monitor_op = {"min": torch.lt, "max": torch.gt}[self.mode] should_update_best_and_save = monitor_op(current, self.best_k_models[self.kth_best_model_path]) - # If using multiple devices, make sure all processes are unimanious on the decision. + # If using multiple devices, make sure all processes are unanimous on the decision. should_update_best_and_save = trainer.training_type_plugin.reduce_boolean_decision(should_update_best_and_save) return should_update_best_and_save From 015fbac6e3e4c06a2c553b3c29ed2f5a7b15c63f Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 12 Mar 2021 08:20:56 +0000 Subject: [PATCH 48/54] update on comennts --- pytorch_lightning/distributed/dist.py | 2 +- .../{utilities => overrides}/torch_distributed.py | 0 pytorch_lightning/plugins/training_type/parallel.py | 2 -- pytorch_lightning/plugins/training_type/single_device.py | 4 +--- 4 files changed, 2 insertions(+), 6 deletions(-) rename pytorch_lightning/{utilities => overrides}/torch_distributed.py (100%) diff --git a/pytorch_lightning/distributed/dist.py b/pytorch_lightning/distributed/dist.py index d11d5a610883b..37ac5d8b13462 100644 --- a/pytorch_lightning/distributed/dist.py +++ b/pytorch_lightning/distributed/dist.py @@ -13,8 +13,8 @@ # limitations under the License. from typing import Any +from pytorch_lightning.overrides.torch_distributed import broadcast_object_list from pytorch_lightning.utilities.distributed import group as _group -from pytorch_lightning.utilities.torch_distributed import broadcast_object_list class LightningDistributed: diff --git a/pytorch_lightning/utilities/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py similarity index 100% rename from pytorch_lightning/utilities/torch_distributed.py rename to pytorch_lightning/overrides/torch_distributed.py diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index a813bdb58ee68..db0ed4a65d371 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -35,8 +35,6 @@ def __init__( ): super().__init__() self.parallel_devices = parallel_devices - self.world_size = 1 - self.local_rank = 0 self.cluster_environment = cluster_environment @property diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 8c84456eaabc6..9a54309c9a980 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Union, Optional +from typing import Any, Optional, Union import torch @@ -23,8 +23,6 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device: torch.device): super().__init__() self.device: torch.device = device - self.world_size = 1 - self.local_rank = 0 @property def on_tpu(self) -> bool: From f668c3a1576fcdfc170afe290ee7e06e6d2bb669 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 12 Mar 2021 08:45:02 +0000 Subject: [PATCH 49/54] update --- pytorch_lightning/plugins/training_type/single_device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 9a54309c9a980..16847d26a2e2b 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -23,6 +23,8 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device: torch.device): super().__init__() self.device: torch.device = device + self.world_size = 1 + self.local_rank = 0 @property def on_tpu(self) -> bool: From 30feb4073181380f02e4ceb31253d6469e9cdfdf Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 12 Mar 2021 11:19:17 +0000 Subject: [PATCH 50/54] update --- pytorch_lightning/plugins/training_type/single_device.py | 2 -- .../plugins/training_type/training_type_plugin.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 16847d26a2e2b..9a54309c9a980 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -23,8 +23,6 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device: torch.device): super().__init__() self.device: torch.device = device - self.world_size = 1 - self.local_rank = 0 @property def on_tpu(self) -> bool: diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index ab0eaed349a2c..f82a296c18158 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -33,7 +33,10 @@ class TrainingTypePlugin(Plugin, ABC): def __init__(self) -> None: self._model = None self._results = None + # This default values will be modified by subclasses self.global_rank = 0 + self.local_rank = 0 + self.world_size = 1 @abstractmethod def connect(self, model: 'Module') -> None: From a4bf623f5ab48c97976165e7df2a763525227a8a Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 12 Mar 2021 11:22:17 +0000 Subject: [PATCH 51/54] update --- pytorch_lightning/plugins/training_type/single_device.py | 3 +++ .../plugins/training_type/training_type_plugin.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 9a54309c9a980..375a84da20d4b 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -23,6 +23,9 @@ class SingleDevicePlugin(TrainingTypePlugin): def __init__(self, device: torch.device): super().__init__() self.device: torch.device = device + self.global_rank = 0 + self.local_rank = 0 + self.world_size = 1 @property def on_tpu(self) -> bool: diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index f82a296c18158..e4d692ef9a870 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -33,10 +33,10 @@ class TrainingTypePlugin(Plugin, ABC): def __init__(self) -> None: self._model = None self._results = None - # This default values will be modified by subclasses - self.global_rank = 0 - self.local_rank = 0 - self.world_size = 1 + # Those default values will be modified by subclasses + self.global_rank = None + self.local_rank = None + self.world_size = None @abstractmethod def connect(self, model: 'Module') -> None: From b482589009e92477702241702d27e197b44233ce Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 12 Mar 2021 11:44:36 +0000 Subject: [PATCH 52/54] fix --- pytorch_lightning/plugins/training_type/parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index db0ed4a65d371..b23460e0a478e 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -36,6 +36,8 @@ def __init__( super().__init__() self.parallel_devices = parallel_devices self.cluster_environment = cluster_environment + self.local_rank = 0 + self.world_size = 1 @property @abstractmethod From 1ad9c62b7927d3dd30db501481d4752f152d10a9 Mon Sep 17 00:00:00 2001 From: tchaton Date: Fri, 12 Mar 2021 11:54:23 +0000 Subject: [PATCH 53/54] remove comments --- pytorch_lightning/plugins/training_type/training_type_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index e4d692ef9a870..f310f42da7c38 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -33,7 +33,6 @@ class TrainingTypePlugin(Plugin, ABC): def __init__(self) -> None: self._model = None self._results = None - # Those default values will be modified by subclasses self.global_rank = None self.local_rank = None self.world_size = None From b64e1051c66a0b0fad1fdcc04285641087f6cc0c Mon Sep 17 00:00:00 2001 From: tchaton Date: Sun, 14 Mar 2021 16:39:02 +0000 Subject: [PATCH 54/54] resolve bugs --- pytorch_lightning/plugins/training_type/parallel.py | 3 ++- .../plugins/training_type/training_type_plugin.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index b23460e0a478e..283d7113795ec 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -36,8 +36,9 @@ def __init__( super().__init__() self.parallel_devices = parallel_devices self.cluster_environment = cluster_environment - self.local_rank = 0 + self.global_rank = 0 self.world_size = 1 + self.local_rank = 0 @property @abstractmethod diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index f310f42da7c38..534e189dcff02 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -33,9 +33,6 @@ class TrainingTypePlugin(Plugin, ABC): def __init__(self) -> None: self._model = None self._results = None - self.global_rank = None - self.local_rank = None - self.world_size = None @abstractmethod def connect(self, model: 'Module') -> None: