
.fit() returns last, not best, weights in ddp_spawn #2565

Merged · 13 commits · Jul 9, 2020
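For context, a minimal sketch of the behavior this PR fixes (BoringModel and the trainer arguments are illustrative placeholders, not code from this PR): under ddp_spawn, training runs in spawned subprocesses, so the parent process has to restore the model from a checkpoint after .fit() returns; before this fix it ended up holding the last weights rather than the best ones.

    import pytorch_lightning as pl

    model = BoringModel()  # stands in for any LightningModule
    trainer = pl.Trainer(gpus=2, distributed_backend='ddp_spawn', max_epochs=5)
    trainer.fit(model)

    # After this PR, the parent process receives the best checkpoint path
    # from the spawned rank-0 process and reloads those weights:
    print(trainer.checkpoint_callback.best_model_path)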
pytorch_lightning/trainer/distrib_data_parallel.py (5 changes: 5 additions and 0 deletions)

@@ -189,6 +189,7 @@ class TrainerDDPMixin(ABC):
    num_nodes: int
    node_rank: int
    tpu_cores: int
    testing: bool

    @property
    @abstractmethod
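The new testing: bool annotation follows the mixin pattern used throughout the trainer: TrainerDDPMixin's methods read attributes that only the concrete Trainer actually sets, and a class-level annotation declares them for type checkers. A stripped-down sketch of the pattern (everything except the testing attribute is illustrative):

    from abc import ABC

    class TrainerDDPMixin(ABC):
        # Declared, not assigned: the concrete Trainer is expected to set
        # this attribute before the mixin's methods run.
        testing: bool

        def run_sketch(self):
            # The mixin can now branch on self.testing without the type
            # checker flagging an unknown attribute.
            return 'skip save' if self.testing else 'save weights'

    class Trainer(TrainerDDPMixin):
        def __init__(self, testing: bool = False):
            self.testing = testing

    print(Trainer(testing=True).run_sketch())  # -> 'skip save'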
@@ -559,9 +560,13 @@ def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0):
        torch.cuda.empty_cache()

        if self.global_rank == 0 and q is not None:
            rank_zero_warn('cleaning up ddp environment...')
            q.put(self.checkpoint_callback.best_model_path)
Review comment (Member), on the q.put(self.checkpoint_callback.best_model_path) line:

Could we add a None check here for checkpoint_callback? The user can set it to None if they want. See #2547

Reply (Contributor, author):

fixed!
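The guard itself isn't visible in this snapshot of the diff; a plausible sketch of the fix the author describes, reusing the surrounding ddp_train names (the None sentinel is an assumption):

            # checkpoint_callback can be disabled (set to None) by the
            # user, so only read best_model_path when it exists.
            best_path = None
            if self.checkpoint_callback is not None:
                best_path = self.checkpoint_callback.best_model_path
            q.put(best_path)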

            q.put(results)

        if not self.testing:
            self.save_spawn_weights(model)

        if self.global_rank == 0 and self.distributed_backend != 'ddp_spawn':
            return results
pytorch_lightning/trainer/trainer.py (7 changes: 6 additions and 1 deletion)

@@ -1043,9 +1043,14 @@ def __run_ddp_spawn(self, model, nprocs):
        # restore main state with best weights
        best_path = q.get()
        results = q.get()

        # transfer back the best path to the trainer
        if best_path is not None and len(best_path) > 0:
            self.checkpoint_callback.best_model_path = best_path
            model.load_from_checkpoint(best_path)

        # load last model weights
        if self.testing:
            self.load_spawn_weights(model)

        self.model = model
        return results
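For readers unfamiliar with the handoff: the spawned rank-0 process and the parent communicate through a multiprocessing queue, and the get() order in __run_ddp_spawn must mirror the put() order in ddp_train. A self-contained sketch of the pattern (the path and result values are placeholders, not PR code):

    import torch.multiprocessing as mp

    def worker(process_idx, q):
        # Mirrors ddp_train: rank 0 puts the best checkpoint path first,
        # then the results.
        if process_idx == 0:
            q.put('/tmp/best.ckpt')
            q.put({'val_loss': 0.1})

    if __name__ == '__main__':
        smp = mp.get_context('spawn')
        q = smp.SimpleQueue()
        mp.spawn(worker, args=(q,), nprocs=2)
        best_path = q.get()  # same order as the puts above
        results = q.get()
        print(best_path, results)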