Set correct device ids in DDP [wip] #4297

Merged
merged 7 commits on Oct 24, 2020
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297))

- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))

## [1.0.3] - 2020-10-20
16 changes: 3 additions & 13 deletions pytorch_lightning/accelerators/accelerator_connector.py
@@ -206,7 +206,7 @@ def select_accelerator(self):

# ddp script mode uses the same flags as TE
# TODO: decouple from TE
if os.environ.get('PL_DDP_PID', False):
if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
Contributor:
Is there documentation anywhere around what env vars Lightning sets?

Contributor Author:
No, I am sure there is no documentation about these. They are used internally only and not meant to be modified by the user.
Should they be documented? I'm not sure.

Contributor (@ananthsub, Oct 23, 2020):
Ah, I meant for contributors/developers primarily. I should probably read the code again too, haha.

Contributor Author:
Ah, good point. Where could env variables be documented? They are like global variables. Maybe at the top of the accelerator base class file. I have no good suggestions :)

use_torchelastic_ddp = False

cluster_env = self._select_environment()
@@ -397,18 +397,8 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):

# set the correct cuda visible devices (using pci order)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

# when slurm is managing the task it sets the visible devices
if not is_slurm_managing_tasks and 'CUDA_VISIBLE_DEVICES' not in os.environ:
if isinstance(data_parallel_device_ids, int):
id_str = ','.join(str(x) for x in list(range(data_parallel_device_ids)))
os.environ["CUDA_VISIBLE_DEVICES"] = id_str
else:
gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

# don't make this debug... this is good UX
devices = os.environ["CUDA_VISIBLE_DEVICES"]
all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())])
devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids)
log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]')

def determine_local_rank(self):
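The added fallback above covers the case where CUDA_VISIBLE_DEVICES was never exported (previously the log line assumed it was always set). A minimal standalone sketch of the same idea; the function name is illustrative, not part of the Trainer:

```python
import os

import torch


def visible_devices_for_logging() -> str:
    """Return CUDA_VISIBLE_DEVICES if set, otherwise every GPU torch can see."""
    # Fallback mirrors the diff above: "0,1,...,N-1" built from device_count().
    all_gpu_ids = ",".join(str(i) for i in range(torch.cuda.device_count()))
    return os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids)


print(f"CUDA_VISIBLE_DEVICES: [{visible_devices_for_logging()}]")
```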
15 changes: 4 additions & 11 deletions pytorch_lightning/accelerators/ddp_accelerator.py
@@ -62,7 +62,7 @@ def setup(self, model):
self._call_children_scripts()

# set the task idx
self.task_idx = int(os.environ['PL_DDP_PID'])
self.task_idx = int(os.environ['LOCAL_RANK'])

def _call_children_scripts(self):
assert self.trainer.global_rank == 0
@@ -106,19 +106,14 @@ def _call_children_scripts(self):
if self.trainer.logger is not None:
os.environ['PL_EXP_VERSION'] = str(self.trainer.logger.version)

gpu_ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if len(gpu_ids) == 1:
gpu_ids = f'{gpu_ids},'

num_gpus = max(1, len(gpu_ids.split(',')))

num_gpus = len(self.trainer.data_parallel_device_ids)
os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'

self.interactive_ddp_procs = []
for local_rank in range(1, self.trainer.num_processes):
env_copy = os.environ.copy()
env_copy['LOCAL_RANK'] = f'{local_rank}'
env_copy['PL_DDP_PID'] = str(self.trainer.data_parallel_device_ids[local_rank])

# remove env var if global seed not set
if os.environ.get('PL_GLOBAL_SEED') is None and 'PL_GLOBAL_SEED' in env_copy:
del env_copy['PL_GLOBAL_SEED']
@@ -137,8 +132,6 @@ def _call_children_scripts(self):
delay = np.random.uniform(1, 5, 1)[0]
sleep(delay)

os.environ['PL_DDP_PID'] = str(0)

def train(self):
model = self.trainer.model

@@ -180,7 +173,7 @@ def set_world_ranks(self, process_idx):
self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

def model_to_device(self, model, process_idx):
Contributor:
Do we need process_idx at all then?

Contributor Author:
Yes, we can probably remove it. The device the model is on is not related to the process index anyway, which was actually the root cause of the bug here.

self.trainer.root_gpu = process_idx
self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank]
torch.cuda.set_device(self.trainer.root_gpu)
model.cuda(self.trainer.root_gpu)

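The core of the fix in model_to_device: the root device is now looked up from the user-requested device ids by local rank, instead of reusing the process index as a GPU id. A short sketch with made-up values standing in for data_parallel_device_ids:

```python
# Hypothetical request: Trainer(gpus=[2, 3]) on a 4-GPU machine.
data_parallel_device_ids = [2, 3]

for local_rank in range(len(data_parallel_device_ids)):
    # Before the fix: root_gpu = local_rank, so rank 1 landed on cuda:1,
    # a GPU the user never asked for.
    # After the fix: the process index only selects which requested id to use.
    root_gpu = data_parallel_device_ids[local_rank]
    print(f"local_rank={local_rank} -> cuda:{root_gpu}")
```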
2 changes: 1 addition & 1 deletion pytorch_lightning/accelerators/ddp_spawn_accelerator.py
@@ -162,7 +162,7 @@ def set_world_ranks(self, process_idx):
self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes

def model_to_device(self, model, process_idx, is_master):
gpu_idx = process_idx
gpu_idx = self.trainer.data_parallel_device_ids[self.trainer.local_rank]
Contributor:
Does this same fix apply to the ddp_torchelastic_accelerator? Are there other DDP accelerators which need this change too?

Contributor Author:
Yes, some of these accelerators, like torchelastic and ddp2, should also get this fix. The problem is that I didn't want to touch these because I cannot test the fix myself, lacking a multi-node setup. It would be good to follow up on this.

Contributor:
Sorry, I actually don't think the others need this fix.

The reason is that torchelastic and co. derive their process numbers (local rank, global rank, etc.) from the environment, so they don't need to do this.

Our DDP is a special case where we are acting like torchelastic, so we need to set this up manually.

self.trainer.root_gpu = gpu_idx
torch.cuda.set_device(self.trainer.root_gpu)
model.cuda(self.trainer.root_gpu)
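To illustrate the reviewer's point: torchelastic-style launchers export the rank variables themselves, so those accelerators only read them rather than computing device assignments. A rough sketch using the usual torch.distributed environment variable names; the defaults are only illustrative:

```python
import os

# Under torchelastic the launcher sets these before the script starts.
# Lightning's manual DDP has to export equivalents itself when it spawns
# the child scripts, which is why it needs fixes like this PR.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
global_rank = int(os.environ.get("RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))

print(f"local_rank={local_rank} global_rank={global_rank} world_size={world_size}")
```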
10 changes: 0 additions & 10 deletions pytorch_lightning/utilities/device_parser.py
@@ -129,18 +129,8 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]:
unmodified gpus variable
"""
all_available_gpus = _get_all_available_gpus()
misconfig = False
for gpu in gpus:
if gpu not in all_available_gpus:
misconfig = True

if misconfig:
# sometimes auto ddp might have different flags
# but this is not what the user intended
# correct for the user
if len(gpus) == len(all_available_gpus):
gpus = all_available_gpus
else:
raise MisconfigurationException(f"""
You requested GPUs: {gpus}
But your machine only has: {all_available_gpus}
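For reference, a minimal standalone sketch of the stricter behaviour left after this deletion, assuming the remaining code simply raises for any requested id that is not available (the exception class below is a stand-in for the real pytorch_lightning import):

```python
from typing import List


class MisconfigurationException(Exception):
    """Stand-in for pytorch_lightning.utilities.exceptions.MisconfigurationException."""


def sanitize_gpu_ids(gpus: List[int], all_available_gpus: List[int]) -> List[int]:
    # No silent "auto-correct" path any more: an unavailable id is an error.
    for gpu in gpus:
        if gpu not in all_available_gpus:
            raise MisconfigurationException(
                f"You requested GPUs: {gpus}\n"
                f"But your machine only has: {all_available_gpus}"
            )
    return gpus


print(sanitize_gpu_ids([0, 1], [0, 1, 2, 3]))  # -> [0, 1]
```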