Set correct device ids in DDP [wip] (#4297)
* repro

  (followed by squashed WIP commits: debug, rank, set, drop PL_DDP_PID, clean up, keep set gpus, revert, Revert "drop PL_DDP_PID" (this reverts commit 7d88cae), pid, gpus, clean up, misconfig, clean, and assorted single-letter placeholder commits)
* fix pep

* changelog

* remove script

Co-authored-by: chaton <thomas@grid.ai>
Co-authored-by: William Falcon <waf2107@columbia.edu>
3 people authored Oct 24, 2020
1 parent 5641b26 commit 28d45a2
Showing 5 changed files with 10 additions and 35 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -31,6 +31,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297))
+
 - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323))
 
 ## [1.0.3] - 2020-10-20
16 changes: 3 additions & 13 deletions pytorch_lightning/accelerators/accelerator_connector.py
@@ -206,7 +206,7 @@ def select_accelerator(self):
 
         # ddp script mode uses the same flags as TE
         # TODO: decouple from TE
-        if os.environ.get('PL_DDP_PID', False):
+        if os.environ.get('PL_IN_DDP_SUBPROCESS', False):
            use_torchelastic_ddp = False
 
        cluster_env = self._select_environment()
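
For context, a minimal sketch of the flag check introduced in this hunk. Where and with what value PL_IN_DDP_SUBPROCESS gets set is not shown in this diff, so the launcher snippet in the comment below is purely illustrative:

    import os

    def is_in_ddp_subprocess() -> bool:
        # Mirrors the check above: any non-empty string counts as "set",
        # because os.environ.get(..., False) returns the raw string when the
        # variable exists and only falls back to False when it does not.
        return bool(os.environ.get('PL_IN_DDP_SUBPROCESS', False))

    # Illustrative only (not from this diff): a launcher would mark its children with
    #   env = os.environ.copy()
    #   env['PL_IN_DDP_SUBPROCESS'] = '1'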
@@ -397,18 +397,8 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
 
         # set the correct cuda visible devices (using pci order)
         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-
-        # when slurm is managing the task it sets the visible devices
-        if not is_slurm_managing_tasks and 'CUDA_VISIBLE_DEVICES' not in os.environ:
-            if isinstance(data_parallel_device_ids, int):
-                id_str = ','.join(str(x) for x in list(range(data_parallel_device_ids)))
-                os.environ["CUDA_VISIBLE_DEVICES"] = id_str
-            else:
-                gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
-                os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str
-
-        # don't make this debug... this is good UX
-        devices = os.environ["CUDA_VISIBLE_DEVICES"]
+        all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())])
+        devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids)
         log.info(f'LOCAL_RANK: {self.trainer.local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]')
 
     def determine_local_rank(self):
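
A standalone sketch of the rewritten set_nvidia_flags behaviour, assuming only that torch is importable: the trainer no longer writes CUDA_VISIBLE_DEVICES itself; it falls back to all detected GPUs for reporting and just logs the result:

    import logging
    import os

    import torch

    log = logging.getLogger(__name__)

    def report_visible_devices(local_rank: int) -> str:
        """Mimic the new set_nvidia_flags: read CUDA_VISIBLE_DEVICES if present,
        otherwise report every GPU torch can see, without mutating the env var."""
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        all_gpu_ids = ",".join(str(x) for x in range(torch.cuda.device_count()))
        devices = os.environ.get("CUDA_VISIBLE_DEVICES", all_gpu_ids)
        log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")
        return devices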
15 changes: 4 additions & 11 deletions pytorch_lightning/accelerators/ddp_accelerator.py
@@ -62,7 +62,7 @@ def setup(self, model):
             self._call_children_scripts()
 
         # set the task idx
-        self.task_idx = int(os.environ['PL_DDP_PID'])
+        self.task_idx = int(os.environ['LOCAL_RANK'])
 
     def _call_children_scripts(self):
         assert self.trainer.global_rank == 0
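
The task index now comes straight from LOCAL_RANK, which the parent writes into each child's environment in the loop further down. A minimal sketch; the '0' default for the launching process is my assumption, the committed code reads the variable unconditionally:

    import os

    def determine_task_idx() -> int:
        # Each DDP process identifies itself by the LOCAL_RANK its parent set.
        return int(os.environ.get('LOCAL_RANK', '0'))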
@@ -106,19 +106,14 @@ def _call_children_scripts(self):
         if self.trainer.logger is not None:
             os.environ['PL_EXP_VERSION'] = str(self.trainer.logger.version)
 
-        gpu_ids = os.environ.get('CUDA_VISIBLE_DEVICES', '')
-        if len(gpu_ids) == 1:
-            gpu_ids = f'{gpu_ids},'
-
-        num_gpus = max(1, len(gpu_ids.split(',')))
-
+        num_gpus = len(self.trainer.data_parallel_device_ids)
         os.environ['WORLD_SIZE'] = f'{num_gpus * self.trainer.num_nodes}'
 
         self.interactive_ddp_procs = []
         for local_rank in range(1, self.trainer.num_processes):
             env_copy = os.environ.copy()
             env_copy['LOCAL_RANK'] = f'{local_rank}'
-            env_copy['PL_DDP_PID'] = str(self.trainer.data_parallel_device_ids[local_rank])
 
             # remove env var if global seed not set
             if os.environ.get('PL_GLOBAL_SEED') is None and 'PL_GLOBAL_SEED' in env_copy:
                 del env_copy['PL_GLOBAL_SEED']
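
World size is now derived from the device ids the user actually requested instead of being parsed back out of CUDA_VISIBLE_DEVICES. A sketch, assuming data_parallel_device_ids is the list of GPU indices passed to the Trainer:

    import os
    from typing import List

    def set_world_size(data_parallel_device_ids: List[int], num_nodes: int) -> int:
        # One process per requested GPU, replicated across nodes.
        num_gpus = len(data_parallel_device_ids)
        world_size = num_gpus * num_nodes
        os.environ['WORLD_SIZE'] = f'{world_size}'
        return world_size

    # e.g. two requested GPUs on two nodes -> WORLD_SIZE == '4'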
@@ -137,8 +132,6 @@ def _call_children_scripts(self):
             delay = np.random.uniform(1, 5, 1)[0]
             sleep(delay)
 
-        os.environ['PL_DDP_PID'] = str(0)
-
     def train(self):
         model = self.trainer.model
 
@@ -180,7 +173,7 @@ def set_world_ranks(self, process_idx):
         self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes
 
     def model_to_device(self, model, process_idx):
-        self.trainer.root_gpu = process_idx
+        self.trainer.root_gpu = self.trainer.data_parallel_device_ids[self.trainer.local_rank]
         torch.cuda.set_device(self.trainer.root_gpu)
         model.cuda(self.trainer.root_gpu)
 
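
The core of the fix: a process's local rank now indexes into the requested device ids rather than being used as the CUDA device id directly. A sketch assuming a CUDA-enabled torch build; the helper name is mine:

    from typing import List

    import torch

    def move_model_to_assigned_gpu(model: torch.nn.Module, local_rank: int,
                                   data_parallel_device_ids: List[int]) -> int:
        # With gpus=[2, 3], local_rank 0 must land on GPU 2 and local_rank 1
        # on GPU 3 -- not on GPUs 0 and 1 as the old process_idx mapping did.
        root_gpu = data_parallel_device_ids[local_rank]
        torch.cuda.set_device(root_gpu)
        model.cuda(root_gpu)
        return root_gpu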
2 changes: 1 addition & 1 deletion pytorch_lightning/accelerators/ddp_spawn_accelerator.py
@@ -162,7 +162,7 @@ def set_world_ranks(self, process_idx):
         self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes
 
     def model_to_device(self, model, process_idx, is_master):
-        gpu_idx = process_idx
+        gpu_idx = self.trainer.data_parallel_device_ids[self.trainer.local_rank]
         self.trainer.root_gpu = gpu_idx
         torch.cuda.set_device(self.trainer.root_gpu)
         model.cuda(self.trainer.root_gpu)
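
The spawn backend gets the same mapping. As an illustrative example (not taken from the diff): with Trainer(gpus=[1, 3], accelerator='ddp_spawn'), spawned local rank 0 now ends up on torch.cuda.set_device(1) and local rank 1 on torch.cuda.set_device(3), instead of devices 0 and 1.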
10 changes: 0 additions & 10 deletions pytorch_lightning/utilities/device_parser.py
@@ -129,18 +129,8 @@ def _sanitize_gpu_ids(gpus: List[int]) -> List[int]:
         unmodified gpus variable
     """
     all_available_gpus = _get_all_available_gpus()
-    misconfig = False
     for gpu in gpus:
         if gpu not in all_available_gpus:
-            misconfig = True
-
-    if misconfig:
-        # sometimes auto ddp might have different flags
-        # but this is not what the user intended
-        # correct for the user
-        if len(gpus) == len(all_available_gpus):
-            gpus = all_available_gpus
-        else:
             raise MisconfigurationException(f"""
                 You requested GPUs: {gpus}
                 But your machine only has: {all_available_gpus}
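
With the misconfig branch gone, the sanitizer no longer silently swaps a mismatched selection for "all GPUs"; any requested id that is not present now raises. A self-contained sketch; the stubbed helper and exception stand in for the real ones in pytorch_lightning, and the assumption is that _get_all_available_gpus simply enumerates torch.cuda.device_count():

    from typing import List

    import torch

    class MisconfigurationException(Exception):
        """Stand-in for pytorch_lightning.utilities.exceptions.MisconfigurationException."""

    def _get_all_available_gpus() -> List[int]:
        # Assumption: mirrors the helper used by device_parser.py.
        return list(range(torch.cuda.device_count()))

    def sanitize_gpu_ids(gpus: List[int]) -> List[int]:
        # After this commit: no silent "correction" -- every requested id must exist.
        all_available_gpus = _get_all_available_gpus()
        for gpu in gpus:
            if gpu not in all_available_gpus:
                raise MisconfigurationException(
                    f"You requested GPUs: {gpus}\n But your machine only has: {all_available_gpus}"
                )
        return gpus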
