From 27d831b6e4ae4b0286ba0159f5c8542e052cd3c9 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Thu, 7 Jul 2022 18:09:29 +0530 Subject: [PATCH] Training reproducibility improvements (#8213) * attempt at reproducibility * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use deterministic algs * fix everything :) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert dataloader changes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * process_batch as np * remove newline * Remove dataloader init fcn * Update val.py * Update train.py * revert additional changes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update train.py * Add --seed arg * Update general.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update train.py * Update train.py * Update val.py * Update train.py * Update general.py * Update general.py * Add deterministic argument to init_seeds() Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher --- train.py | 3 ++- utils/general.py | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index 3161159ba44d..bf5b4c69d74c 100644 --- a/train.py +++ b/train.py @@ -101,7 +101,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Config plots = not evolve and not opt.noplots # create plots cuda = device.type != 'cpu' - init_seeds(1 + RANK) + init_seeds(opt.seed + 1 + RANK, deterministic=True) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] @@ -504,6 +504,7 @@ def parse_opt(known=False): parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)') parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2') parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)') + parser.add_argument('--seed', type=int, default=0, help='Global training seed') parser.add_argument('--local_rank', type=int, default=-1, help='Automatic DDP Multi-GPU argument, do not modify') # Weights & Biases arguments diff --git a/utils/general.py b/utils/general.py index a3e242d78a17..17b689010b39 100755 --- a/utils/general.py +++ b/utils/general.py @@ -195,14 +195,22 @@ def print_args(args: Optional[dict] = None, show_file=True, show_fcn=False): LOGGER.info(colorstr(s) + ', '.join(f'{k}={v}' for k, v in args.items())) -def init_seeds(seed=0): +def init_seeds(seed=0, deterministic=False): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn + + if deterministic and check_version(torch.__version__, '1.12.0'): # https://github.com/ultralytics/yolov5/pull/8213 + torch.use_deterministic_algorithms(True) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + # os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) cudnn.benchmark, cudnn.deterministic = (False, True) if seed == 0 else (True, False) + # torch.cuda.manual_seed(seed) + # torch.cuda.manual_seed_all(seed) # for multi GPU, exception safe def intersect_dicts(da, db, exclude=()):