From 34b835608b6bdf607bd71a648e0e7f96293b3b92 Mon Sep 17 00:00:00 2001
From: tchaton
Date: Thu, 1 Apr 2021 11:10:56 +0100
Subject: [PATCH] resolve bug

---
 .../plugins/training_type/tpu_spawn.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py
index 4077ef2b01970..85a1c2fe1c2a6 100644
--- a/pytorch_lightning/plugins/training_type/tpu_spawn.py
+++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py
@@ -118,11 +118,13 @@ def new_process(self, process_idx: int, trainer, mp_queue) -> None:
         self.__save_end_of_training_weights(self.lightning_module)
         self.transfer_distrib_spawn_state_on_fit_end(results)
 
+        # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542
+        self.barrier("end-process")
+
+        # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358
         if self.global_rank == 0:
             time.sleep(2)
 
-        self.barrier("end-process")
-
     def __save_end_of_training_weights(self, model: LightningModule) -> None:
         # when training ends on these platforms dump weights to get out of the main process
         if on_colab_kaggle():
@@ -158,16 +160,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, results):
             self.mp_queue.put(results)
 
     def save(self, state_dict: Dict, path: str) -> None:
-        """
-        Saving with ``xm.save`` can be unstable and miss the rendez-vous after ``torch.save``.
-        The rendez-vous doesn't affect directly saving.
-        We can ignore the ``RuntimeError`` to reduce friction with TPUs.
-        """
-        try:
-            xm.save(state_dict, path)
-        except RuntimeError as e:
-            if "Failed to meet rendezvous" not in str(e):
-                raise e
+        xm.save(state_dict, path)
 
     def broadcast(self, obj: object, src: int = 0) -> object:
         buffer = io.BytesIO()
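
For context (a note appended to this message, not part of the patch): the fix reorders the end-of-training rendezvous so it happens before rank 0 sleeps. Below is a minimal standalone sketch of that ordering, assuming only the public torch_xla APIs (xm.rendezvous, xm.get_ordinal, xmp.spawn); the _mp_fn name and the nprocs value are illustrative, not taken from the patch.

import time

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp


def _mp_fn(index):
    # ... training / checkpointing work elided ...

    # All ranks must reach this rendezvous before any of them proceeds to
    # exit; if rank 0 slept and returned first, the remaining ranks could
    # hang waiting on it (see pytorch/xla#1801).
    xm.rendezvous("end-process")

    # Only after every rank has synced does rank 0 pause briefly, giving
    # the other processes time to exit cleanly (see pytorch/xla#2190).
    if xm.get_ordinal() == 0:
        time.sleep(2)


if __name__ == "__main__":
    xmp.spawn(_mp_fn, args=(), nprocs=8)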