Shubhamagarwal92 master (Lightning-AI#1349)
* SA: for Lightning-AI#958: set torch cuda device when finding root

* SA: for Lightning-AI#958: removing root gpu hack in trainer/evaluation_loop

* SA: setting torch cuda device

* comment line too long

* check if root gpu exists or available

* Incorporating suggestions on Lightning-AI#1094

* since root gpu returns none instead of -1 for cpu

* undo changes

* fixed dp memory thing

Co-authored-by: Shubham Agarwal <shubhamagarwal92@gmail.com>
2 people authored and tullie committed May 6, 2020
1 parent d9f60b4 commit 6d7a1b4
Showing 2 changed files with 4 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pytorch_lightning/trainer/distrib_parts.py
@@ -526,6 +526,9 @@ def dp_train(self, model):
         if isinstance(device_ids, int):
             device_ids = list(range(device_ids))
 
+        # set dp device
+        torch.cuda.set_device(self.root_gpu)
+
         model = LightningDataParallel(model, device_ids=device_ids)
 
         self.run_pretrain_routine(model)
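For context on the "dp memory" fix above, here is a minimal standalone sketch (not part of this commit) of why the current CUDA device is pinned to the root GPU before the model is wrapped for data parallelism: if the default device is left at cuda:0, allocations that do not name a device explicitly can land on GPU 0 even when it is not among the DataParallel device ids. The GPU indices, model, and input below are hypothetical.

import torch
from torch.nn.parallel import DataParallel

# Hypothetical setup: replicate across GPUs 1 and 2, rooted at GPU 1
# (device_ids / root_gpu stand in for self.data_parallel_device_ids / self.root_gpu).
device_ids = [1, 2]
root_gpu = device_ids[0]

# Without this call the current CUDA device stays at cuda:0, so allocations
# that do not name a device explicitly can end up on GPU 0.
torch.cuda.set_device(root_gpu)

model = torch.nn.Linear(128, 10).to(f"cuda:{root_gpu}")
dp_model = DataParallel(model, device_ids=device_ids, output_device=root_gpu)

x = torch.randn(8, 128, device=f"cuda:{root_gpu}")
out = dp_model(x)  # replicas run on GPUs 1 and 2; outputs gather on the root GPU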
1 change: 1 addition & 0 deletions pytorch_lightning/trainer/trainer.py
@@ -389,6 +389,7 @@ def __init__(
         self.gpus = gpus
         self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
         self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)
+        self.root_device = torch.device("cpu")
 
         # tpu state flags
         self.use_tpu = False
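The commit message also notes that the root GPU is None (rather than -1) for CPU-only runs, so any torch.cuda.set_device call has to be guarded. A minimal sketch of that guard, using a hypothetical TrainerStub whose attribute names mirror the diff above (the real Trainer derives root_gpu via parse_gpu_ids / determine_root_gpu_device):

import torch

class TrainerStub:
    """Hypothetical stand-in; attribute names mirror the diff above."""

    def __init__(self, gpu_ids=None):
        # In Lightning, determine_root_gpu_device returns None for CPU-only
        # runs, not -1, hence the explicit None check below.
        self.data_parallel_device_ids = gpu_ids         # e.g. [0, 1] or None
        self.root_gpu = gpu_ids[0] if gpu_ids else None
        self.root_device = torch.device("cpu")          # default added by this commit

    def setup_device(self):
        # check if root gpu exists or is available before touching CUDA state
        if self.root_gpu is not None and torch.cuda.is_available():
            torch.cuda.set_device(self.root_gpu)
            self.root_device = torch.device("cuda", self.root_gpu)

trainer = TrainerStub(gpu_ids=[1, 2])
trainer.setup_device()
print(trainer.root_device)  # cuda:1 with GPUs available, otherwise cpu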
