From 7a560de1c863bde0d16bf0500a37e9873fd8c1d1 Mon Sep 17 00:00:00 2001
From: yxNONG <62932917+yxNONG@users.noreply.github.com>
Date: Tue, 30 Jun 2020 19:06:28 +0800
Subject: [PATCH 1/6] Unify the checkpoint of single- and multi-GPU training

Save model.hyp etc. to the checkpoint when training on multiple GPUs.
---
 train.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 39dd0e555572..d933a5de074d 100644
--- a/train.py
+++ b/train.py
@@ -79,7 +79,7 @@ def train(hyp):
     # Create model
     model = Model(opt.cfg).to(device)
     assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc'])
-    model.names = data_dict['names']
+
 
     # Image sizes
     gs = int(max(model.stride))  # grid size (max stride)
@@ -172,6 +172,7 @@ def train(hyp):
     model.hyp = hyp  # attach hyperparameters to model
     model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
+    model.names = data_dict['names']
 
     # Class frequency
     labels = np.concatenate(dataset.labels, 0)
@@ -314,6 +315,14 @@ def train(hyp):
         # Save model
         save = (not opt.nosave) or (final_epoch and not opt.evolve)
         if save:
+            if hasattr(model, 'module'):
+                # Duplicate Model parameters for Multi-GPU save
+                ema.ema.module.nc = model.nc  # attach number of classes to model
+                ema.ema.module.hyp = model.hyp  # attach hyperparameters to model
+                ema.ema.module.gr = model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
+                ema.ema.module.class_weights = model.class_weights  # attach class weights
+                ema.ema.module.names = data_dict['names']
+
             with open(results_file, 'r') as f:  # create checkpoint
                 ckpt = {'epoch': epoch,
                         'best_fitness': best_fitness,
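
Note: the save-time block in patch 1 exists because attributes assigned to a
DataParallel/DistributedDataParallel wrapper (nc, hyp, gr, names, ...) live on
the wrapper object, not on the wrapped .module, so a checkpoint built from
ema.ema.module would otherwise silently lose them. A minimal sketch of the
pitfall, not part of the patch (runs on CPU):

    import torch.nn as nn

    model = nn.DataParallel(nn.Linear(10, 2))
    model.nc = 80                        # set on the wrapper only
    print(hasattr(model, 'nc'))          # True
    print(hasattr(model.module, 'nc'))   # False -> lost if only .module is saved
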
From 65646b3a8c2f6bfe8ee9e2c2be974d0d8c5d1407 Mon Sep 17 00:00:00 2001
From: yxNONG <62932917+yxNONG@users.noreply.github.com>
Date: Thu, 2 Jul 2020 13:48:19 +0800
Subject: [PATCH 2/6] Update utils.py

---
 utils/utils.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/utils/utils.py b/utils/utils.py
index c33f41f71410..220599935360 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -421,7 +421,9 @@ def compute_loss(p, targets, model):  # predictions, targets, model
     ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor
     lcls, lbox, lobj = ft([0]), ft([0]), ft([0])
     tcls, tbox, indices, anchors = build_targets(p, targets, model)  # targets
-    h = model.hyp  # hyperparameters
+    h = model.module.hyp if hasattr(model, 'module') else model.hyp  # hyperparameters
+    nc = model.module.nc if hasattr(model, 'module') else model.nc
+    gr = model.module.gr if hasattr(model, 'module') else model.gr
     red = 'mean'  # Loss reduction (sum or mean)
 
     # Define criteria
@@ -455,10 +457,10 @@ def compute_loss(p, targets, model):  # predictions, targets, model
             lbox += (1.0 - giou).sum() if red == 'sum' else (1.0 - giou).mean()  # giou loss
 
             # Obj
-            tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
+            tobj[b, a, gj, gi] = (1.0 - gr) + gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
 
             # Class
-            if model.nc > 1:  # cls loss (only if multiple classes)
+            if nc > 1:  # cls loss (only if multiple classes)
                 t = torch.full_like(ps[:, 5:], cn)  # targets
                 t[range(nb), tcls[i]] = cp
                 lcls += BCEcls(ps[:, 5:], t)  # BCE
@@ -477,7 +479,7 @@ def compute_loss(p, targets, model):  # predictions, targets, model
         g = 3.0  # loss gain
         lobj *= g / bs
         if nt:
-            lcls *= g / nt / model.nc
+            lcls *= g / nt / nc
             lbox *= g / nt
 
     loss = lbox + lobj + lcls
@@ -488,6 +490,8 @@ def build_targets(p, targets, model):
     # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
     det = model.module.model[-1] if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) \
         else model.model[-1]  # Detect() module
+    hyp = model.module.hyp if hasattr(model, 'module') else model.hyp
+
     na, nt = det.na, targets.shape[0]  # number of anchors, targets
     tcls, tbox, indices, anch = [], [], [], []
     gain = torch.ones(6, device=targets.device)  # normalized to gridspace gain
@@ -503,7 +507,7 @@ def build_targets(p, targets, model):
         a, t, offsets = [], targets * gain, 0
         if nt:
             r = t[None, :, 4:6] / anchors[:, None]  # wh ratio
-            j = torch.max(r, 1. / r).max(2)[0] < model.hyp['anchor_t']  # compare
+            j = torch.max(r, 1. / r).max(2)[0] < hyp['anchor_t']  # compare
             # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n) = wh_iou(anchors(3,2), gwh(n,2))
             a, t = at[j], t.repeat(na, 1, 1)[j]  # filter
 
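
Note: patch 2 repeats the same unwrap guard once per attribute read. The
pattern can be factored into a single helper; a sketch of that idea (the
helper name is illustrative, not from the repo):

    def unwrap(model):
        # Return the underlying module of a DP/DDP wrapper, else the model itself
        return model.module if hasattr(model, 'module') else model

    # e.g. h, nc, gr = unwrap(model).hyp, unwrap(model).nc, unwrap(model).gr
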
From dc44124a16990d47d4e8358e9888d0ae814b20f4 Mon Sep 17 00:00:00 2001
From: yxNONG <62932917+yxNONG@users.noreply.github.com>
Date: Thu, 2 Jul 2020 13:51:52 +0800
Subject: [PATCH 3/6] Update train.py

---
 train.py | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/train.py b/train.py
index d933a5de074d..3b7c9a575678 100644
--- a/train.py
+++ b/train.py
@@ -147,15 +147,6 @@ def train(hyp):
     # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
     # plot_lr_scheduler(optimizer, scheduler, epochs)
 
-    # Initialize distributed training
-    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
-        dist.init_process_group(backend='nccl',  # distributed backend
-                                init_method='tcp://127.0.0.1:9999',  # init method
-                                world_size=1,  # number of nodes
-                                rank=0)  # node rank
-        model = torch.nn.parallel.DistributedDataParallel(model)
-        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
-
     # Trainloader
     dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                             hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect)
@@ -173,6 +164,15 @@ def train(hyp):
     model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
     model.names = data_dict['names']
+
+    # Initialize distributed training
+    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
+        dist.init_process_group(backend='nccl',  # distributed backend
+                                init_method='tcp://127.0.0.1:9999',  # init method
+                                world_size=1,  # number of nodes
+                                rank=0)  # node rank
+        model = torch.nn.parallel.DistributedDataParallel(model)
+        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
 
     # Class frequency
     labels = np.concatenate(dataset.labels, 0)
@@ -289,7 +289,7 @@ def train(hyp):
                                              batch_size=batch_size,
                                              imgsz=imgsz_test,
                                              save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
-                                             model=ema.ema,
+                                             model=ema.ema.module if hasattr(model, 'module') else ema.ema,
                                              single_cls=opt.single_cls,
                                              dataloader=testloader)
 
@@ -315,14 +315,6 @@ def train(hyp):
         # Save model
         save = (not opt.nosave) or (final_epoch and not opt.evolve)
         if save:
-            if hasattr(model, 'module'):
-                # Duplicate Model parameters for Multi-GPU save
-                ema.ema.module.nc = model.nc  # attach number of classes to model
-                ema.ema.module.hyp = model.hyp  # attach hyperparameters to model
-                ema.ema.module.gr = model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
-                ema.ema.module.class_weights = model.class_weights  # attach class weights
-                ema.ema.module.names = data_dict['names']
-
             with open(results_file, 'r') as f:  # create checkpoint
                 ckpt = {'epoch': epoch,
                         'best_fitness': best_fitness,
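
Note: patch 3 takes the opposite approach to patch 1: attach the attributes to
the raw model first, then wrap it, so they stay reachable through .module and
no save-time duplication is needed. Illustration, not part of the patch
(DataParallel is used here only so the snippet runs on CPU):

    import torch.nn as nn

    net = nn.Linear(10, 2)
    net.nc = 80                  # attach to the raw module first...
    net = nn.DataParallel(net)   # ...then wrap
    print(net.module.nc)         # 80 -> still reachable through .module
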
From 83aa8e50ad911bbfae0bad2723182d8a30c4cfb5 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Thu, 2 Jul 2020 12:00:55 -0700
Subject: [PATCH 4/6] Update train.py

---
 train.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/train.py b/train.py
index 3b7c9a575678..dfc9ecdbdbc5 100644
--- a/train.py
+++ b/train.py
@@ -79,7 +79,6 @@ def train(hyp):
     # Create model
     model = Model(opt.cfg).to(device)
     assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc'])
-
 
     # Image sizes
     gs = int(max(model.stride))  # grid size (max stride)
@@ -133,7 +132,13 @@ def train(hyp):
             with open(results_file, 'w') as file:
                 file.write(ckpt['training_results'])  # write results.txt
 
+        # epochs
         start_epoch = ckpt['epoch'] + 1
+        if epochs < start_epoch:
+            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
+                  (opt.weights, ckpt['epoch'], epochs))
+            epochs += ckpt['epoch']  # finetune additional epochs
+
         del ckpt
 
     # Mixed precision training https://github.com/NVIDIA/apex
@@ -147,6 +152,15 @@ def train(hyp):
     # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
     # plot_lr_scheduler(optimizer, scheduler, epochs)
 
+    # Initialize distributed training
+    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
+        dist.init_process_group(backend='nccl',  # distributed backend
+                                init_method='tcp://127.0.0.1:9999',  # init method
+                                world_size=1,  # number of nodes
+                                rank=0)  # node rank
+        model = torch.nn.parallel.DistributedDataParallel(model)
+        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
+
     # Trainloader
     dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                             hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect)
@@ -155,7 +169,7 @@ def train(hyp):
 
     # Testloader
     testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt,
-                                  hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0]
+                                   hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0]
 
     # Model parameters
     hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
@@ -164,15 +178,6 @@ def train(hyp):
     model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
     model.names = data_dict['names']
-
-    # Initialize distributed training
-    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
-        dist.init_process_group(backend='nccl',  # distributed backend
-                                init_method='tcp://127.0.0.1:9999',  # init method
-                                world_size=1,  # number of nodes
-                                rank=0)  # node rank
-        model = torch.nn.parallel.DistributedDataParallel(model)
-        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
 
     # Class frequency
     labels = np.concatenate(dataset.labels, 0)
@@ -373,7 +378,7 @@ def train(hyp):
     parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%')
     parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
     opt = parser.parse_args()
-    opt.weights = last if opt.resume else opt.weights
+    opt.weights = last if opt.resume and not opt.weights else opt.weights
     opt.cfg = check_file(opt.cfg)  # check file
     opt.data = check_file(opt.data)  # check file
     print(opt)

From 56c412aff62d7f6c246c200d1817309fa7ea8755 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Thu, 2 Jul 2020 12:01:43 -0700
Subject: [PATCH 5/6] Update utils.py

---
 utils/utils.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/utils/utils.py b/utils/utils.py
index 220599935360..305486a5f6a3 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -47,7 +47,7 @@ def check_git_status():
 
 def check_img_size(img_size, s=32):
     # Verify img_size is a multiple of stride s
-    new_size = make_divisible(img_size, s)  # ceil gs-multiple
+    new_size = make_divisible(img_size, int(s))  # ceil gs-multiple
     if new_size != img_size:
         print('WARNING: --img-size %g must be multiple of max stride %g, updating to %g' % (img_size, s, new_size))
     return new_size
@@ -421,9 +421,7 @@ def compute_loss(p, targets, model):  # predictions, targets, model
     ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor
     lcls, lbox, lobj = ft([0]), ft([0]), ft([0])
     tcls, tbox, indices, anchors = build_targets(p, targets, model)  # targets
-    h = model.module.hyp if hasattr(model, 'module') else model.hyp  # hyperparameters
-    nc = model.module.nc if hasattr(model, 'module') else model.nc
-    gr = model.module.gr if hasattr(model, 'module') else model.gr
+    h = model.hyp  # hyperparameters
     red = 'mean'  # Loss reduction (sum or mean)
 
     # Define criteria
@@ -457,10 +455,10 @@ def compute_loss(p, targets, model):  # predictions, targets, model
             lbox += (1.0 - giou).sum() if red == 'sum' else (1.0 - giou).mean()  # giou loss
 
             # Obj
-            tobj[b, a, gj, gi] = (1.0 - gr) + gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
+            tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
 
             # Class
-            if nc > 1:  # cls loss (only if multiple classes)
+            if model.nc > 1:  # cls loss (only if multiple classes)
                 t = torch.full_like(ps[:, 5:], cn)  # targets
                 t[range(nb), tcls[i]] = cp
                 lcls += BCEcls(ps[:, 5:], t)  # BCE
@@ -479,7 +477,7 @@ def compute_loss(p, targets, model):  # predictions, targets, model
         g = 3.0  # loss gain
         lobj *= g / bs
         if nt:
-            lcls *= g / nt / nc
+            lcls *= g / nt / model.nc
             lbox *= g / nt
 
     loss = lbox + lobj + lcls
@@ -490,8 +488,6 @@ def build_targets(p, targets, model):
     # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
     det = model.module.model[-1] if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) \
         else model.model[-1]  # Detect() module
-    hyp = model.module.hyp if hasattr(model, 'module') else model.hyp
-
     na, nt = det.na, targets.shape[0]  # number of anchors, targets
     tcls, tbox, indices, anch = [], [], [], []
     gain = torch.ones(6, device=targets.device)  # normalized to gridspace gain
@@ -507,7 +503,7 @@ def build_targets(p, targets, model):
         a, t, offsets = [], targets * gain, 0
         if nt:
             r = t[None, :, 4:6] / anchors[:, None]  # wh ratio
-            j = torch.max(r, 1. / r).max(2)[0] < hyp['anchor_t']  # compare
+            j = torch.max(r, 1. / r).max(2)[0] < model.hyp['anchor_t']  # compare
             # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n) = wh_iou(anchors(3,2), gwh(n,2))
             a, t = at[j], t.repeat(na, 1, 1)[j]  # filter
 
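
Note: patch 5 reverts the per-call unwrap guards of patch 2 (patch 4 restored
the attach-then-wrap ordering in train.py, making them unnecessary) and
hardens check_img_size() with int(s), presumably because the max stride can
arrive as a tensor or float. A sketch of the rounding check_img_size() relies
on (equivalent ceil logic, not the repo's source):

    import math

    def make_divisible_sketch(x, divisor):
        # Round x up to the nearest multiple of divisor
        return math.ceil(x / divisor) * divisor

    print(make_divisible_sketch(633, 32))  # 640
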
From c3c6ba05f67794537836aa5c2038274e9d371fb8 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Thu, 2 Jul 2020 12:03:45 -0700
Subject: [PATCH 6/6] Update torch_utils.py

---
 utils/torch_utils.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index e069792e6e3f..a62adc9cf122 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -54,6 +54,11 @@ def time_synchronized():
     return time.time()
 
 
+def is_parallel(model):
+    # Returns True if model is of type DP or DDP
+    return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
+
+
 def initialize_weights(model):
     for m in model.modules():
         t = type(m)
@@ -111,8 +116,8 @@ def model_info(model, verbose=False):
 
     try:  # FLOPS
         from thop import profile
-        macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False)
-        fs = ', %.1f GFLOPS' % (macs / 1E9 * 2)
+        flops = profile(deepcopy(model), inputs=(torch.zeros(1, 3, 64, 64),), verbose=False)[0] / 1E9 * 2
+        fs = ', %.1f GFLOPS' % (flops * 100)  # 640x640 GFLOPS (64x64 scaled by 100)
     except:
         fs = ''
 
@@ -185,7 +190,7 @@ def update(self, model):
         self.updates += 1
         d = self.decay(self.updates)
         with torch.no_grad():
-            if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
+            if is_parallel(model):
                 msd, esd = model.module.state_dict(), self.ema.module.state_dict()
             else:
                 msd, esd = model.state_dict(), self.ema.state_dict()
@@ -196,7 +201,8 @@ def update(self, model):
                     v += (1. - d) * msd[k].detach()
 
     def update_attr(self, model):
-        # Assign attributes (which may change during training)
-        for k in model.__dict__.keys():
-            if not k.startswith('_'):
-                setattr(self.ema, k, getattr(model, k))
+        # Update class attributes
+        ema = self.ema.module if is_parallel(model) else self.ema
+        for k, v in model.__dict__.items():
+            if not k.startswith('_') and k != 'module':
+                setattr(ema, k, v)
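
Note: patch 6 centralizes the wrapper check in is_parallel() and rewrites
update_attr() to copy public attributes onto the unwrapped EMA, skipping
'module' so a wrapper reference is never nested into the EMA model. The FLOPS
estimate now profiles a deepcopy at 64x64 and multiplies by 100, since
(640/64)^2 = 100 scales the count to a 640x640 input. Usage sketch of the new
helper (runs on CPU):

    import torch.nn as nn

    def is_parallel(model):
        # True if model is wrapped in DataParallel or DistributedDataParallel
        return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)

    print(is_parallel(nn.Linear(2, 2)))                   # False
    print(is_parallel(nn.DataParallel(nn.Linear(2, 2))))  # True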