From 9cb0aeecc8d759acda0fad4304c25c0869d9f938 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Sun, 5 Sep 2021 19:09:53 +0200
Subject: [PATCH] EarlyStopper updates (#4679)

---
 train.py             | 6 +++---
 utils/torch_utils.py | 7 +++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/train.py b/train.py
index 89f86401c187..72aee2cb8883 100644
--- a/train.py
+++ b/train.py
@@ -344,7 +344,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
             # mAP
             callbacks.on_train_epoch_end(epoch=epoch)
             ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
-            final_epoch = epoch + 1 == epochs
+            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
             if not noval or final_epoch:  # Calculate mAP
                 results, maps, _ = val.run(data_dict,
                                            batch_size=batch_size // WORLD_SIZE * 2,
@@ -384,7 +384,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                 callbacks.on_model_save(last, epoch, final_epoch, best_fitness, fi)
 
             # Stop Single-GPU
-            if stopper(epoch=epoch, fitness=fi):
+            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
                 break
 
             # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576
@@ -462,7 +462,7 @@ def parse_opt(known=False):
     parser.add_argument('--artifact_alias', type=str, default="latest", help='version of dataset artifact to be used')
     parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')
     parser.add_argument('--freeze', type=int, default=0, help='Number of layers to freeze. backbone=10, all=24')
-    parser.add_argument('--patience', type=int, default=30, help='EarlyStopping patience (epochs)')
+    parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
     opt = parser.parse_known_args()[0] if known else parser.parse_args()
     return opt
 
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 2e153921eb10..04e1446bb908 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -298,13 +298,16 @@ class EarlyStopping:
     def __init__(self, patience=30):
         self.best_fitness = 0.0  # i.e. mAP
         self.best_epoch = 0
-        self.patience = patience  # epochs to wait after fitness stops improving to stop
+        self.patience = patience or float('inf')  # epochs to wait after fitness stops improving to stop
+        self.possible_stop = False  # possible stop may occur next epoch
 
     def __call__(self, epoch, fitness):
         if fitness >= self.best_fitness:  # >= 0 to allow for early zero-fitness stage of training
             self.best_epoch = epoch
             self.best_fitness = fitness
-        stop = (epoch - self.best_epoch) >= self.patience  # stop training if patience exceeded
+        delta = epoch - self.best_epoch  # epochs without improvement
+        self.possible_stop = delta >= (self.patience - 1)  # possible stop may occur next epoch
+        stop = delta >= self.patience  # stop training if patience exceeded
         if stop:
             LOGGER.info(f'EarlyStopping patience {self.patience} exceeded, stopping training.')
         return stop