diff --git a/train.py b/train.py
index 0aa7a13628dc..275e0a4b1a8e 100644
--- a/train.py
+++ b/train.py
@@ -493,7 +493,7 @@ def main(opt):
         assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
+        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")
 
     # Train
     if not opt.evolve:
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index dff0617e87c9..2eb51d80f34e 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -35,10 +35,10 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
     """
     if local_rank not in [-1, 0]:
-        dist.barrier()
+        dist.barrier(device_ids=[local_rank])
     yield
     if local_rank == 0:
-        dist.barrier()
+        dist.barrier(device_ids=[0])
 
 
 def init_torch_seeds(seed=0):
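Passing `device_ids` to `dist.barrier()` tells the NCCL backend explicitly which GPU each rank should use for the barrier collective, instead of letting NCCL infer it. Below is a minimal sketch of how the patched `torch_distributed_zero_first` context manager is typically driven; the `__main__` launcher code and the placeholder work inside the `with` block are illustrative assumptions, not part of this diff, and assume a `torchrun`-style launch that sets `LOCAL_RANK`.

```python
# Minimal sketch (assumed driver code, not from the diff): run with
# `torchrun --nproc_per_node=N this_file.py` so LOCAL_RANK is set per process.
import os
from contextlib import contextmanager

import torch
import torch.distributed as dist


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """Make all ranks wait while the local master (rank 0) does something first."""
    if local_rank not in [-1, 0]:
        # Non-master ranks block here; device_ids pins the NCCL barrier to this rank's GPU.
        dist.barrier(device_ids=[local_rank])
    yield
    if local_rank == 0:
        # Master finished its work; release the waiting ranks.
        dist.barrier(device_ids=[0])


if __name__ == "__main__":
    local_rank = int(os.getenv("LOCAL_RANK", -1))
    if local_rank != -1:
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

    with torch_distributed_zero_first(local_rank):
        pass  # e.g. a dataset download/cache step that only rank 0 should perform

    if local_rank != -1:
        dist.destroy_process_group()
```

Note that `device_ids` is only honored by the NCCL backend, matching the CUDA/DDP path this diff targets; in single-process runs (`local_rank == -1`) both barrier calls are skipped, so the context manager is a no-op.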