Remove DDP process group timeout (ultralytics#4422)
glenn-jocher authored and CesarBazanAV committed Sep 29, 2021
1 parent 14b416d commit c1fd77d
Showing 2 changed files with 3 additions and 3 deletions.
train.py (1 addition, 1 deletion)
@@ -493,7 +493,7 @@ def main(opt):
         assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

     # Train
     if not opt.evolve:
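
Note: the removed timeout=timedelta(seconds=60) had capped process-group setup and subsequent collectives at 60 seconds; without it, torch.distributed falls back to its default timeout (30 minutes), which is more forgiving of slow multi-node startup. Below is a minimal sketch (not the repository's code) of the same initialization pattern, assuming the process was launched with torch.distributed.run or torch.distributed.launch, which set LOCAL_RANK, RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT in the environment:

import os

import torch
import torch.distributed as dist

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))

if LOCAL_RANK != -1:
    torch.cuda.set_device(LOCAL_RANK)
    device = torch.device('cuda', LOCAL_RANK)
    # No explicit timeout argument: the library default (timedelta(minutes=30))
    # now applies instead of the 60-second limit removed by this commit.
    dist.init_process_group(backend='nccl' if dist.is_nccl_available() else 'gloo')
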
utils/torch_utils.py (2 additions, 2 deletions)
@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
     """
     if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
     yield
     if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])


 def init_torch_seeds(seed=0):
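
Note: torch_distributed_zero_first is a context manager (the repository's docstring calls it a decorator) that lets rank 0 finish one-time work, such as dataset download or cache building, before the other ranks run the same body, typically hitting the cache. Passing device_ids tells the NCCL backend which GPU each barrier should run on. A usage sketch under assumptions: prepare_dataset() is a hypothetical placeholder for that one-time work, and the process group is already initialized as in train.py above:

import os

from utils.torch_utils import torch_distributed_zero_first

LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))


def prepare_dataset():
    # Hypothetical placeholder for one-time work (download, cache build, ...).
    return 'dataset'


with torch_distributed_zero_first(LOCAL_RANK):
    # Ranks other than 0 block here on dist.barrier(device_ids=[LOCAL_RANK]),
    # so rank 0 (or a non-distributed run, LOCAL_RANK == -1) runs the body first.
    dataset = prepare_dataset()
# On exit, rank 0 calls dist.barrier(device_ids=[0]), releasing the waiting ranks,
# which then run the body themselves (normally reading the cached result).
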
