From 8ae9ea1fac7e442403e83c719867decfad385b36 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Fri, 18 Jun 2021 16:13:13 +0200 Subject: [PATCH] try nccl --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index ae70e237a250..e4838e61b503 100644 --- a/train.py +++ b/train.py @@ -533,7 +533,7 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary assert torch.cuda.device_count() > LOCAL_RANK, 'too few GPUS for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) - dist.init_process_group(backend="gloo") # distributed backend + dist.init_process_group(backend="nccl") # distributed backend assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count' assert not opt.image_weights, '--image-weights argument is not compatible with DDP training' opt.batch_size = opt.total_batch_size // WORLD_SIZE