From 6935a54e603d634f6b0a9026604dc5875d1ca990 Mon Sep 17 00:00:00 2001
From: Giacomo Guiduzzi <10937563+giacomoguiduzzi@users.noreply.github.com>
Date: Wed, 29 Jun 2022 12:41:46 +0200
Subject: [PATCH] Implementation of Early Stopping for DDP training (#8345)

* Implementation of Early Stopping for DDP training

This edit correctly uses the broadcast_object_list() function to send
slave processes a boolean so as to end the training phase when the
variable is True, thus allowing the master process to destroy the
process group and terminate.

* Update train.py

* Update train.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update train.py

* Update train.py

* Update train.py

* Further cleanup

This cleans up the definition of broadcast_list and removes the
requirement for clear() afterward.

Co-authored-by: Glenn Jocher
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 train.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/train.py b/train.py
index e1393213bb4b..dd5eeb600a76 100644
--- a/train.py
+++ b/train.py
@@ -294,7 +294,7 @@ def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictio
     results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
     scheduler.last_epoch = start_epoch - 1  # do not move
     scaler = torch.cuda.amp.GradScaler(enabled=amp)
-    stopper = EarlyStopping(patience=opt.patience)
+    stopper, stop = EarlyStopping(patience=opt.patience), False
     compute_loss = ComputeLoss(model)  # init loss class
     callbacks.run('on_train_start')
     LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
@@ -402,6 +402,7 @@ def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictio
 
             # Update best mAP
             fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
+            stop = stopper(epoch=epoch, fitness=fi)  # early stop check
             if fi > best_fitness:
                 best_fitness = fi
             log_vals = list(mloss) + list(results) + lr
@@ -428,19 +429,14 @@ def train(hyp, opt, device, callbacks):  # hyp is path/to/hyp.yaml or hyp dictio
                 del ckpt
                 callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)
 
-            # Stop Single-GPU
-            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
-                break
-
-            # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576
-            # stop = stopper(epoch=epoch, fitness=fi)
-            # if RANK == 0:
-            #    dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks
-
-            # Stop DPP
-            # with torch_distributed_zero_first(RANK):
-            # if stop:
-            #    break  # must break all DDP ranks
+        # EarlyStopping
+        if RANK != -1:  # if DDP training
+            broadcast_list = [stop if RANK == 0 else None]
+            dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
+            if RANK != 0:
+                stop = broadcast_list[0]
+        if stop:
+            break  # must break all DDP ranks
 
         # end epoch ----------------------------------------------------------------------------------------------------
     # end training -----------------------------------------------------------------------------------------------------
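
For reference, below is a minimal, self-contained sketch of the rank-0 broadcast
pattern this patch adds to train.py. It is an illustration, not part of the
patch: the gloo backend, the hard-coded MASTER_ADDR/MASTER_PORT, the two-process
world size, and the stand-in stopping condition (epoch >= 3, in place of
YOLOv5's EarlyStopping/fitness check) are all assumptions chosen so the script
runs on CPU.

    # Standalone sketch of the early-stop broadcast used in the patch above.
    # Assumptions: gloo backend, localhost rendezvous, fake stopping signal.
    import os

    import torch.distributed as dist
    import torch.multiprocessing as mp


    def train(rank, world_size):
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '29500'
        dist.init_process_group('gloo', rank=rank, world_size=world_size)

        stop = False
        for epoch in range(100):
            # ... forward/backward/optimizer step would go here ...

            if rank == 0:
                stop = epoch >= 3  # stand-in for stopper(epoch=epoch, fitness=fi)

            # Same pattern as the patch: rank 0 fills the one-element list, the
            # other ranks pass a placeholder; after the collective call every
            # rank holds rank 0's decision.
            broadcast_list = [stop if rank == 0 else None]
            dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
            if rank != 0:
                stop = broadcast_list[0]
            if stop:
                break  # break on all ranks together

        print(f'rank {rank} stopped after epoch {epoch}')
        dist.destroy_process_group()


    if __name__ == '__main__':
        world_size = 2
        mp.spawn(train, args=(world_size,), nprocs=world_size)

The reason the decision is broadcast rather than acted on locally is that every
rank must leave the epoch loop together: if rank 0 broke out alone, the
remaining ranks would block forever on their next collective operation.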