diff --git a/CHANGELOG.md b/CHANGELOG.md
index 098952439562d..8d979b41893f5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297))
+
 - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))
 
 ## [1.0.3] - 2020-10-20
diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py
index f8dbfb0295b93..df70d90755632 100644
--- a/pytorch_lightning/accelerators/accelerator_connector.py
+++ b/pytorch_lightning/accelerators/accelerator_connector.py
@@ -206,7 +206,7 @@ def select_accelerator(self):
 
         # ddp script mode uses the same flags as TE
         # TODO: decouple from TE
-        if os.environ.get('PL_DDP_PID', False):
+        if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
             use_torchelastic_ddp = False
 
         cluster_env = self._select_environment()
@@ -397,18 +397,8 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
 
         # set the correct cuda visible devices (using pci order)
         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
-        # when slurm is managing the task it sets the visible devices
-        if not is_slurm_managing_tasks and 'CUDA_VISIBLE_DEVICES' not in os.environ:
-            if isinstance(data_parallel_device_ids, int):
-                id_str = ','.join(str(x) for x in list(range(data_parallel_device_ids)))
-                os.environ["CUDA_VISIBLE_DEVICES"] = id_str
-            else:
-                gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
-                os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str
-
-        # don't make this debug... this is good UX
-        devices = os.environ["CUDA_VISIBLE_DEVICES"]
+        all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())])
+        devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids)
         log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]')
 
     def determine_local_rank(self):
diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py
index e9566dc930a67..b9f01b5ddc167 100644
--- a/pytorch_lightning/accelerators/ddp_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_accelerator.py
@@ -62,7 +62,7 @@ def setup(self, model):
             self._call_children_scripts()
 
         # set the task idx
-        self.task_idx = int(os.environ['PL_DDP_PID'])
+        self.task_idx = int(os.environ['LOCAL_RANK'])
 
     def _call_children_scripts(self):
         assert self.trainer.global_rank == 0
@@ -106,19 +106,14 @@ def _call_children_scripts(self):
         if self.trainer.logger is not None:
             os.environ['PL_EXP_VERSION'] = str(self.trainer.logger.version)
 
-        gpu_ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
-        if len(gpu_ids) == 1:
-            gpu_ids = f'{gpu_ids},'
-
-        num_gpus = max(1, len(gpu_ids.split(',')))
-
+        num_gpus = len(self.trainer.data_parallel_device_ids)
         os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'
 
         self.interactive_ddp_procs = []
         for local_rank in range(1, self.trainer.num_processes):
             env_copy = os.environ.copy()
             env_copy['LOCAL_RANK'] = f'{local_rank}'
-            env_copy['PL_DDP_PID'] = str(self.trainer.data_parallel_device_ids[local_rank])
+
             # remove env var if global seed not set
             if os.environ.get('PL_GLOBAL_SEED') is None and 'PL_GLOBAL_SEED' in env_copy:
                 del env_copy['PL_GLOBAL_SEED']
@@ -137,8 +132,6 @@ def _call_children_scripts(self):
             delay = np.random.uniform(1, 5, 1)[0]
             sleep(delay)
 
-        os.environ['PL_DDP_PID'] = str(0)
-
     def train(self):
         model = self.trainer.model
 
@@ -180,7 +173,7 @@ def set_world_ranks(self, process_idx):
         self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes
 
     def model_to_device(self, model, process_idx):
-        self.trainer.root_gpu = process_idx
+        self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank]
         torch.cuda.set_device(self.trainer.root_gpu)
         model.cuda(self.trainer.root_gpu)
 
diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py
index 636e944cdccb1..b204494773362 100644
--- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py
@@ -162,7 +162,7 @@ def set_world_ranks(self, process_idx):
         self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes
 
     def model_to_device(self, model, process_idx, is_master):
-        gpu_idx = process_idx
+        gpu_idx = self.trainer.data_parallel_device_ids[self.trainer.local_rank]
         self.trainer.root_gpu = gpu_idx
         torch.cuda.set_device(self.trainer.root_gpu)
         model.cuda(self.trainer.root_gpu)
diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py
index f67b09eccf51d..eb0a6fe5c95a4 100644
--- a/pytorch_lightning/utilities/device_parser.py
+++ b/pytorch_lightning/utilities/device_parser.py
@@ -129,18 +129,8 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]:
         unmodified gpus variable
     """
     all_available_gpus = _get_all_available_gpus()
-    misconfig = False
     for gpu in gpus:
         if gpu not in all_available_gpus:
-            misconfig = True
-
-    if misconfig:
-        # sometimes auto ddp might have different flags
-        # but this is not what the user intended
-        # correct for the user
-        if len(gpus) == len(all_available_gpus):
-            gpus = all_available_gpus
-        else:
             raise MisconfigurationException(f"""
                 You requested GPUs: {gpus}
                 But your machine only has: {all_available_gpus}
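
For context, the recurring change across `ddp_accelerator.py` and `ddp_spawn_accelerator.py` is the rank-to-device mapping: the root GPU is now looked up in `data_parallel_device_ids` by local rank instead of taken from the raw process index. A minimal sketch of that mapping, not part of the PR itself; `pick_root_gpu` is a hypothetical helper, not Lightning's API:

    # Map a DDP worker's local rank to the GPU id the user actually requested.
    def pick_root_gpu(data_parallel_device_ids, local_rank):
        # With Trainer(gpus=[2, 3]), the worker with local_rank 0 must bind
        # to GPU 2, not GPU 0; indexing the user-provided device-id list by
        # local rank guarantees that.
        return data_parallel_device_ids[local_rank]

    requested_gpus = [2, 3]  # e.g. from Trainer(gpus=[2, 3])
    for local_rank in range(len(requested_gpus)):
        root_gpu = pick_root_gpu(requested_gpus, local_rank)
        assert root_gpu == requested_gpus[local_rank]
        # Per the diff above, each worker then runs:
        #   torch.cuda.set_device(root_gpu); model.cuda(root_gpu)

This is also why the `CUDA_VISIBLE_DEVICES` rewriting in `set_nvidia_flags` could be dropped: once each worker binds to its requested device id directly, Lightning no longer needs to mask devices on the user's behalf.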