From cc232e3e3557009bfaa0eb60fdda3787178e886d Mon Sep 17 00:00:00 2001
From: Troy
Date: Thu, 23 Nov 2023 20:12:05 +0800
Subject: [PATCH] Increase NCCL timeout to 3 hours (#12345)

* Increase NCCL timeout to 3 hours

When training on a large dataset with DDP, the dataset scanning process
can take a very long time and raise an NCCL timeout error. Change the
default timeout from 30 minutes to 3 hours, the same as ultralytics
yolov8 (https://github.com/ultralytics/ultralytics/pull/3343).

Signed-off-by: Troy

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Troy
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Glenn Jocher
---
 train.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 004c8eeda121..4c3bec34835f 100644
--- a/train.py
+++ b/train.py
@@ -23,7 +23,7 @@
 import sys
 import time
 from copy import deepcopy
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 try:
@@ -529,7 +529,8 @@ def main(opt, callbacks=Callbacks()):
         assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
-        dist.init_process_group(backend='nccl' if dist.is_nccl_available() else 'gloo')
+        dist.init_process_group(backend='nccl' if dist.is_nccl_available() else 'gloo',
+                                timeout=timedelta(seconds=10800))
 
     # Train
     if not opt.evolve:
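
For context, the effect of this change can be exercised outside train.py with
a minimal standalone sketch. The file name ddp_timeout_demo.py and the
init_ddp helper below are illustrative assumptions, not part of this patch;
the sketch assumes a PyTorch build with distributed support and is launched
via torchrun so that RANK, WORLD_SIZE, and MASTER_ADDR/MASTER_PORT are set:

    # ddp_timeout_demo.py (hypothetical file name, not part of the PR)
    # Launch with e.g.: torchrun --nproc_per_node=2 ddp_timeout_demo.py
    from datetime import timedelta

    import torch.distributed as dist


    def init_ddp(timeout_seconds=10800):  # 3 hours, matching the patch
        """Initialize the default process group with an extended timeout.

        Collective operations that stall longer than `timeout` are aborted
        with an error; raising it above the previous default (30 minutes,
        per the commit message) leaves room for long dataset scans before
        the first collective completes.
        """
        backend = 'nccl' if dist.is_nccl_available() else 'gloo'  # gloo = CPU fallback
        dist.init_process_group(backend=backend,
                                timeout=timedelta(seconds=timeout_seconds))


    if __name__ == '__main__':
        init_ddp()
        print(f'rank {dist.get_rank()} of {dist.get_world_size()} initialized')
        dist.destroy_process_group()

The 10800-second value mirrors the diff above: init_process_group simply
forwards the timedelta to the backend, so ranks that wait on a peer (for
example while rank 0 scans and caches a large dataset) now fail with a
timeout after 3 hours instead of 30 minutes, rather than dying mid-scan.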