From cdb9bde181641917504717e162952826fca61a41 Mon Sep 17 00:00:00 2001 From: yxNONG <62932917+yxNONG@users.noreply.github.com> Date: Tue, 30 Jun 2020 19:06:28 +0800 Subject: [PATCH 01/16] Unify the check point of single and multi GPU save the model.hyp etc to checkpoint when use multi GPU training --- train.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index 39dd0e555572..d933a5de074d 100644 --- a/train.py +++ b/train.py @@ -79,7 +79,7 @@ def train(hyp): # Create model model = Model(opt.cfg).to(device) assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc']) - model.names = data_dict['names'] + # Image sizes gs = int(max(model.stride)) # grid size (max stride) @@ -172,6 +172,7 @@ def train(hyp): model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights + model.names = data_dict['names'] # Class frequency labels = np.concatenate(dataset.labels, 0) @@ -314,6 +315,14 @@ def train(hyp): # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: + if hasattr(model, 'module'): + # Duplicate Model parameters for Multi-GPU save + ema.ema.module.nc = model.nc # attach number of classes to model + ema.ema.module.hyp = model.hyp # attach hyperparameters to model + ema.ema.module.gr = model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) + ema.ema.module.class_weights = model.class_weights # attach class weights + ema.ema.module.names = data_dict['names'] + with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, From 86784cfdbf123df3945ff5664a3ffe46a2304aa8 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Tue, 30 Jun 2020 21:43:53 -0700 Subject: [PATCH 02/16] --resume bug fix #252 --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 27a877157302..fa672d1cd194 100644 --- a/train.py +++ b/train.py @@ -378,7 +378,7 @@ def train(hyp): parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%') parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') opt = parser.parse_args() - opt.weights = last if opt.resume else opt.weights + opt.weights = last if opt.resume and not opt.weights else opt.weights opt.cfg = check_file(opt.cfg) # check file opt.data = check_file(opt.data) # check file print(opt) From 1c86c2cb6600f4d019fcd3ba67bebceeaf982f5a Mon Sep 17 00:00:00 2001 From: edurenye Date: Wed, 1 Jul 2020 10:04:26 +0200 Subject: [PATCH 03/16] Add torchscript files to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5a95798f0f61..07993ab27f15 100755 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ gcp_test*.sh *.pt *.onnx *.mlmodel +*.torchscript darknet53.conv.74 yolov3-tiny.conv.15 From b5659d1195907472d7db5f23bab15d7d3b101891 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 1 Jul 2020 11:44:49 -0700 Subject: [PATCH 04/16] module updates --- models/common.py | 17 ++++++++++------- models/experimental.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/models/common.py b/models/common.py index 3c4a0d729210..6a5972311f77 100644 --- a/models/common.py +++ b/models/common.py @@ -1,9 +1,13 @@ # This file contains modules common to various models - 
from utils.utils import * +def autopad(k): + # Pad to 'same' + return k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + + def DWConv(c1, c2, k=1, s=1, act=True): # Depthwise convolution return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act) @@ -11,10 +15,9 @@ def DWConv(c1, c2, k=1, s=1, act=True): class Conv(nn.Module): # Standard convolution - def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups super(Conv, self).__init__() - p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # padding - self.conv = nn.Conv2d(c1, c2, k, s, p, groups=g, bias=False) + self.conv = nn.Conv2d(c1, c2, k, s, p or autopad(k), groups=g, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = nn.LeakyReLU(0.1, inplace=True) if act else nn.Identity() @@ -46,7 +49,7 @@ def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, nu self.cv1 = Conv(c1, c_, 1, 1) self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) - self.cv4 = Conv(c2, c2, 1, 1) + self.cv4 = Conv(2 * c_, c2, 1, 1) self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) self.act = nn.LeakyReLU(0.1, inplace=True) self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)]) @@ -79,9 +82,9 @@ def forward(self, x): class Focus(nn.Module): # Focus wh information into c-space - def __init__(self, c1, c2, k=1): + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups super(Focus, self).__init__() - self.conv = Conv(c1 * 4, c2, k, 1) + self.conv = Conv(c1 * 4, c2, k, s, p, g, act) def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1)) diff --git a/models/experimental.py b/models/experimental.py index 60cb7aa14cd5..cff9d141446d 100644 --- a/models/experimental.py +++ b/models/experimental.py @@ -1,6 +1,40 @@ +# This file contains experimental modules + from models.common import * +class CrossConv(nn.Module): + # Cross Convolution + def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): # ch_in, ch_out, shortcut, groups, expansion + super(CrossConv, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, (1, 3), 1) + self.cv2 = Conv(c_, c2, (3, 1), 1, g=g) + self.add = shortcut and c1 == c2 + + def forward(self, x): + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class C3(nn.Module): + # Cross Convolution CSP + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion + super(C3, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) + self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) + self.cv4 = Conv(2 * c_, c2, 1, 1) + self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) + self.act = nn.LeakyReLU(0.1, inplace=True) + self.m = nn.Sequential(*[CrossConv(c_, c_, shortcut, g, e=1.0) for _ in range(n)]) + + def forward(self, x): + y1 = self.cv3(self.m(self.cv1(x))) + y2 = self.cv2(x) + return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) + + class Sum(nn.Module): # Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070 def __init__(self, n, weight=False): # n: number of inputs From f1d67f4110a29292b372aec2b94243ea82a9f7a2 Mon Sep 17 00:00:00 
2001 From: Glenn Jocher Date: Wed, 1 Jul 2020 15:46:15 -0700 Subject: [PATCH 05/16] update export.py --- models/export.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/models/export.py b/models/export.py index 2aa6ce403ac6..1c78d3a8b19a 100644 --- a/models/export.py +++ b/models/export.py @@ -1,4 +1,4 @@ -"""Exports a YOLOv5 *.pt model to *.onnx and *.torchscript formats +"""Exports a YOLOv5 *.pt model to ONNX and TorchScript formats Usage: $ export PYTHONPATH="$PWD" && python models/export.py --weights ./weights/yolov5s.pt --img 640 --batch 1 @@ -30,20 +30,20 @@ model.model[-1].export = True # set Detect() layer export=True _ = model(img) # dry run - # Export to torchscript + # Export to TorchScript try: f = opt.weights.replace('.pt', '.torchscript') # filename ts = torch.jit.trace(model, img) ts.save(f) - print('Torchscript export success, saved as %s' % f) - except: - print('Torchscript export failed.') + print('TorchScript export success, saved as %s' % f) + except Exception as e: + print('TorchScript export failed: %s' % e) # Export to ONNX try: f = opt.weights.replace('.pt', '.onnx') # filename model.fuse() # only for ONNX - torch.onnx.export(model, img, f, verbose=False, opset_version=11, input_names=['images'], + torch.onnx.export(model, img, f, verbose=False, opset_version=12, input_names=['images'], output_names=['output']) # output_names=['classes', 'boxes'] # Checks @@ -51,5 +51,5 @@ onnx.checker.check_model(onnx_model) # check onnx model print(onnx.helper.printable_graph(onnx_model.graph)) # print a human readable representation of the graph print('ONNX export success, saved as %s\nView with https://github.com/lutzroeder/netron' % f) - except: - print('ONNX export failed.') + except Exception as e: + print('ONNX export failed: %s' % e) From a62a1c2c679cc6de730debf1529f073d10180452 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 1 Jul 2020 16:14:49 -0700 Subject: [PATCH 06/16] export.py update --- detect.py | 8 ++++---- models/export.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/detect.py b/detect.py index bb84a0df0c2c..93faf6da4553 100644 --- a/detect.py +++ b/detect.py @@ -156,9 +156,9 @@ def detect(save_img=False): print(opt) with torch.no_grad(): - detect() + # detect() # Update all models - # for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt', 'yolov3-spp.pt']: - # detect() - # create_pretrained(opt.weights, opt.weights) + for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt', 'yolov3-spp.pt']: + detect() + create_pretrained(opt.weights, opt.weights) diff --git a/models/export.py b/models/export.py index 1c78d3a8b19a..bb310f3f89a0 100644 --- a/models/export.py +++ b/models/export.py @@ -6,8 +6,6 @@ import argparse -import onnx - from models.common import * from utils import google_utils @@ -21,7 +19,7 @@ print(opt) # Input - img = torch.zeros((opt.batch_size, 3, *opt.img_size)) # image size, (1, 3, 320, 192) iDetection + img = torch.zeros((opt.batch_size, 3, *opt.img_size)) # image size(1,3,320,192) iDetection # Load PyTorch model google_utils.attempt_download(opt.weights) @@ -30,7 +28,7 @@ model.model[-1].export = True # set Detect() layer export=True _ = model(img) # dry run - # Export to TorchScript + # TorchScript export try: f = opt.weights.replace('.pt', '.torchscript') # filename ts = torch.jit.trace(model, img) @@ -39,8 +37,10 @@ except Exception as e: print('TorchScript export failed: %s' % e) - # Export to ONNX + # ONNX export try: + 
import onnx + f = opt.weights.replace('.pt', '.onnx') # filename model.fuse() # only for ONNX torch.onnx.export(model, img, f, verbose=False, opset_version=12, input_names=['images'], From 5323ad224d90680da9cdb7fd8b82089750b7252e Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 1 Jul 2020 16:15:25 -0700 Subject: [PATCH 07/16] export.py update --- detect.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/detect.py b/detect.py index 93faf6da4553..2650c202d49d 100644 --- a/detect.py +++ b/detect.py @@ -156,9 +156,9 @@ def detect(save_img=False): print(opt) with torch.no_grad(): - # detect() + detect() - # Update all models - for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt', 'yolov3-spp.pt']: - detect() - create_pretrained(opt.weights, opt.weights) + # # Update all models + # for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt', 'yolov3-spp.pt']: + # detect() + # create_pretrained(opt.weights, opt.weights) From 1fca7a7f2461f6e178833ebc7d938fea86a6bf84 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 1 Jul 2020 19:15:59 -0700 Subject: [PATCH 08/16] autopad() update in common.py --- models/common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/models/common.py b/models/common.py index 6a5972311f77..2c2d600394c1 100644 --- a/models/common.py +++ b/models/common.py @@ -3,9 +3,11 @@ from utils.utils import * -def autopad(k): +def autopad(k, p=None): # kernel, padding # Pad to 'same' - return k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + return p def DWConv(c1, c2, k=1, s=1, act=True): @@ -17,7 +19,7 @@ class Conv(nn.Module): # Standard convolution def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups super(Conv, self).__init__() - self.conv = nn.Conv2d(c1, c2, k, s, p or autopad(k), groups=g, bias=False) + self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = nn.LeakyReLU(0.1, inplace=True) if act else nn.Identity() From 53cdaf6bf5fcdf28b140d3898d59876ceee5fac0 Mon Sep 17 00:00:00 2001 From: yxNONG <62932917+yxNONG@users.noreply.github.com> Date: Thu, 2 Jul 2020 13:48:19 +0800 Subject: [PATCH 09/16] Update utils.py --- utils/utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/utils/utils.py b/utils/utils.py index c33f41f71410..220599935360 100755 --- a/utils/utils.py +++ b/utils/utils.py @@ -421,7 +421,9 @@ def compute_loss(p, targets, model): # predictions, targets, model ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor lcls, lbox, lobj = ft([0]), ft([0]), ft([0]) tcls, tbox, indices, anchors = build_targets(p, targets, model) # targets - h = model.hyp # hyperparameters + h = model.module.hyp if hasattr(model, 'module') else model.hyp # hyperparameters + nc = model.module.nc if hasattr(model, 'module') else model.nc + gr = model.module.gr if hasattr(model, 'module') else model.gr red = 'mean' # Loss reduction (sum or mean) # Define criteria @@ -455,10 +457,10 @@ def compute_loss(p, targets, model): # predictions, targets, model lbox += (1.0 - giou).sum() if red == 'sum' else (1.0 - giou).mean() # giou loss # Obj - tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype) # giou ratio + tobj[b, a, gj, gi] = (1.0 - gr) + gr * giou.detach().clamp(0).type(tobj.dtype) # giou ratio # 
Class - if model.nc > 1: # cls loss (only if multiple classes) + if nc > 1: # cls loss (only if multiple classes) t = torch.full_like(ps[:, 5:], cn) # targets t[range(nb), tcls[i]] = cp lcls += BCEcls(ps[:, 5:], t) # BCE @@ -477,7 +479,7 @@ def compute_loss(p, targets, model): # predictions, targets, model g = 3.0 # loss gain lobj *= g / bs if nt: - lcls *= g / nt / model.nc + lcls *= g / nt / nc lbox *= g / nt loss = lbox + lobj + lcls @@ -488,6 +490,8 @@ def build_targets(p, targets, model): # Build targets for compute_loss(), input targets(image,class,x,y,w,h) det = model.module.model[-1] if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) \ else model.model[-1] # Detect() module + hyp = model.module.hyp if hasattr(model, 'module') else model.hyp + na, nt = det.na, targets.shape[0] # number of anchors, targets tcls, tbox, indices, anch = [], [], [], [] gain = torch.ones(6, device=targets.device) # normalized to gridspace gain @@ -503,7 +507,7 @@ def build_targets(p, targets, model): a, t, offsets = [], targets * gain, 0 if nt: r = t[None, :, 4:6] / anchors[:, None] # wh ratio - j = torch.max(r, 1. / r).max(2)[0] < model.hyp['anchor_t'] # compare + j = torch.max(r, 1. / r).max(2)[0] < hyp['anchor_t'] # compare # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t'] # iou(3,n) = wh_iou(anchors(3,2), gwh(n,2)) a, t = at[j], t.repeat(na, 1, 1)[j] # filter From 1aa2b679333657cc20a702dabb1b5de3315cf577 Mon Sep 17 00:00:00 2001 From: yxNONG <62932917+yxNONG@users.noreply.github.com> Date: Thu, 2 Jul 2020 13:51:52 +0800 Subject: [PATCH 10/16] Update train.py --- train.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/train.py b/train.py index d933a5de074d..3b7c9a575678 100644 --- a/train.py +++ b/train.py @@ -147,15 +147,6 @@ def train(hyp): # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) - # Initialize distributed training - if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): - dist.init_process_group(backend='nccl', # distributed backend - init_method='tcp://127.0.0.1:9999', # init method - world_size=1, # number of nodes - rank=0) # node rank - model = torch.nn.parallel.DistributedDataParallel(model) - # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html - # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect) @@ -173,6 +164,15 @@ def train(hyp): model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model.names = data_dict['names'] + + # Initialize distributed training + if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): + dist.init_process_group(backend='nccl', # distributed backend + init_method='tcp://127.0.0.1:9999', # init method + world_size=1, # number of nodes + rank=0) # node rank + model = torch.nn.parallel.DistributedDataParallel(model) + # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html # Class frequency labels = np.concatenate(dataset.labels, 0) @@ -289,7 +289,7 @@ def train(hyp): batch_size=batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), - model=ema.ema, + 
model=ema.ema.module if hasattr(model, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader) @@ -315,14 +315,6 @@ def train(hyp): # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: - if hasattr(model, 'module'): - # Duplicate Model parameters for Multi-GPU save - ema.ema.module.nc = model.nc # attach number of classes to model - ema.ema.module.hyp = model.hyp # attach hyperparameters to model - ema.ema.module.gr = model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) - ema.ema.module.class_weights = model.class_weights # attach class weights - ema.ema.module.names = data_dict['names'] - with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, From 13f69777a69c4a6056c7cb8499c7e3910868122d Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 2 Jul 2020 09:26:03 -0700 Subject: [PATCH 11/16] typo fix --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index fa672d1cd194..0cc3f31003ae 100644 --- a/train.py +++ b/train.py @@ -119,7 +119,7 @@ def train(hyp): model.load_state_dict(ckpt['model'], strict=False) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ - "Please delete or update %s and try again, or use --weights '' to train from scatch." \ + "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (opt.weights, opt.cfg, opt.weights, opt.weights) raise KeyError(s) from e From 597ed4ce630071cd5809c65c1322f5b961ba8c9c Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 2 Jul 2020 12:00:55 -0700 Subject: [PATCH 12/16] Update train.py --- train.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/train.py b/train.py index 3b7c9a575678..dfc9ecdbdbc5 100644 --- a/train.py +++ b/train.py @@ -79,7 +79,6 @@ def train(hyp): # Create model model = Model(opt.cfg).to(device) assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc']) - # Image sizes gs = int(max(model.stride)) # grid size (max stride) @@ -133,7 +132,13 @@ def train(hyp): with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt + # epochs start_epoch = ckpt['epoch'] + 1 + if epochs < start_epoch: + print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% + (opt.weights, ckpt['epoch'], epochs)) + epochs += ckpt['epoch'] # finetune additional epochs + del ckpt # Mixed precision training https://github.com/NVIDIA/apex @@ -147,6 +152,15 @@ def train(hyp): # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) + # Initialize distributed training + if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): + dist.init_process_group(backend='nccl', # distributed backend + init_method='tcp://127.0.0.1:9999', # init method + world_size=1, # number of nodes + rank=0) # node rank + model = torch.nn.parallel.DistributedDataParallel(model) + # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html + # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect) @@ -155,7 +169,7 @@ def train(hyp): # Testloader testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt, - hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0] + hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset @@ -164,15 +178,6 @@ def train(hyp): model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model.names = data_dict['names'] - - # Initialize distributed training - if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available(): - dist.init_process_group(backend='nccl', # distributed backend - init_method='tcp://127.0.0.1:9999', # init method - world_size=1, # number of nodes - rank=0) # node rank - model = torch.nn.parallel.DistributedDataParallel(model) - # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html # Class frequency labels = np.concatenate(dataset.labels, 0) @@ -373,7 +378,7 @@ def train(hyp): parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%') parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') opt = parser.parse_args() - opt.weights = last if opt.resume else opt.weights + opt.weights = last if opt.resume and not opt.weights else opt.weights opt.cfg = check_file(opt.cfg) # check file opt.data = check_file(opt.data) # check file print(opt) From fc7c42723d8008438b217072dfa088612ac76225 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 2 Jul 2020 12:01:43 -0700 Subject: [PATCH 13/16] Update utils.py --- utils/utils.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/utils/utils.py b/utils/utils.py index 220599935360..305486a5f6a3 100755 --- a/utils/utils.py +++ b/utils/utils.py @@ -47,7 +47,7 @@ def check_git_status(): def check_img_size(img_size, s=32): # Verify img_size is a multiple of stride s - new_size = make_divisible(img_size, s) # ceil gs-multiple + new_size = make_divisible(img_size, int(s)) # ceil gs-multiple if new_size != img_size: print('WARNING: --img-size %g must be multiple of max stride %g, updating to %g' % (img_size, s, new_size)) return new_size @@ -421,9 +421,7 @@ def compute_loss(p, targets, model): # predictions, targets, model ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor lcls, lbox, lobj = ft([0]), ft([0]), ft([0]) tcls, tbox, indices, 
anchors = build_targets(p, targets, model) # targets - h = model.module.hyp if hasattr(model, 'module') else model.hyp # hyperparameters - nc = model.module.nc if hasattr(model, 'module') else model.nc - gr = model.module.gr if hasattr(model, 'module') else model.gr + h = model.hyp # hyperparameters red = 'mean' # Loss reduction (sum or mean) # Define criteria @@ -457,10 +455,10 @@ def compute_loss(p, targets, model): # predictions, targets, model lbox += (1.0 - giou).sum() if red == 'sum' else (1.0 - giou).mean() # giou loss # Obj - tobj[b, a, gj, gi] = (1.0 - gr) + gr * giou.detach().clamp(0).type(tobj.dtype) # giou ratio + tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype) # giou ratio # Class - if nc > 1: # cls loss (only if multiple classes) + if model.nc > 1: # cls loss (only if multiple classes) t = torch.full_like(ps[:, 5:], cn) # targets t[range(nb), tcls[i]] = cp lcls += BCEcls(ps[:, 5:], t) # BCE @@ -479,7 +477,7 @@ def compute_loss(p, targets, model): # predictions, targets, model g = 3.0 # loss gain lobj *= g / bs if nt: - lcls *= g / nt / nc + lcls *= g / nt / model.nc lbox *= g / nt loss = lbox + lobj + lcls @@ -490,8 +488,6 @@ def build_targets(p, targets, model): # Build targets for compute_loss(), input targets(image,class,x,y,w,h) det = model.module.model[-1] if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) \ else model.model[-1] # Detect() module - hyp = model.module.hyp if hasattr(model, 'module') else model.hyp - na, nt = det.na, targets.shape[0] # number of anchors, targets tcls, tbox, indices, anch = [], [], [], [] gain = torch.ones(6, device=targets.device) # normalized to gridspace gain @@ -507,7 +503,7 @@ def build_targets(p, targets, model): a, t, offsets = [], targets * gain, 0 if nt: r = t[None, :, 4:6] / anchors[:, None] # wh ratio - j = torch.max(r, 1. / r).max(2)[0] < hyp['anchor_t'] # compare + j = torch.max(r, 1. 
/ r).max(2)[0] < model.hyp['anchor_t'] # compare # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t'] # iou(3,n) = wh_iou(anchors(3,2), gwh(n,2)) a, t = at[j], t.repeat(na, 1, 1)[j] # filter From f02481c73a4f8e3dbc0ae809b50310c0b2d700c9 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 2 Jul 2020 12:03:45 -0700 Subject: [PATCH 14/16] Update torch_utils.py --- utils/torch_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/utils/torch_utils.py b/utils/torch_utils.py index e069792e6e3f..a62adc9cf122 100644 --- a/utils/torch_utils.py +++ b/utils/torch_utils.py @@ -54,6 +54,11 @@ def time_synchronized(): return time.time() +def is_parallel(model): + # is model is parallel with DP or DDP + return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) + + def initialize_weights(model): for m in model.modules(): t = type(m) @@ -111,8 +116,8 @@ def model_info(model, verbose=False): try: # FLOPS from thop import profile - macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False) - fs = ', %.1f GFLOPS' % (macs / 1E9 * 2) + flops = profile(deepcopy(model), inputs=(torch.zeros(1, 3, 64, 64),), verbose=False)[0] / 1E9 * 2 + fs = ', %.1f GFLOPS' % (flops * 100) # 640x640 FLOPS except: fs = '' @@ -185,7 +190,7 @@ def update(self, model): self.updates += 1 d = self.decay(self.updates) with torch.no_grad(): - if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel): + if is_parallel(model): msd, esd = model.module.state_dict(), self.ema.module.state_dict() else: msd, esd = model.state_dict(), self.ema.state_dict() @@ -196,7 +201,8 @@ def update(self, model): v += (1. - d) * msd[k].detach() def update_attr(self, model): - # Assign attributes (which may change during training) - for k in model.__dict__.keys(): - if not k.startswith('_'): - setattr(self.ema, k, getattr(model, k)) + # Update class attributes + ema = self.ema.module if is_parallel(model) else self.ema + for k, v in model.__dict__.items(): + if not k.startswith('_') and k != 'module': + setattr(ema, k, v) From 6ca3f35cd4e6834adc116e2d1ebe2defa082c7e8 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 2 Jul 2020 16:41:03 -0700 Subject: [PATCH 15/16] update .dockerignore --- .dockerignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.dockerignore b/.dockerignore index a68626df5f2e..42f241f28c7b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -14,8 +14,10 @@ data/samples/* # Neural Network weights ----------------------------------------------------------------------------------------------- **/*.weights **/*.pt +**/*.pth **/*.onnx **/*.mlmodel +**/*.torchscript # Below Copied From .gitignore ----------------------------------------------------------------------------------------- From 3bdea3f697d4fce36c8e24a0701c0f419fa8f63a Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 2 Jul 2020 21:24:26 -0700 Subject: [PATCH 16/16] strip_optimizer() bug fix #253 --- train.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/train.py b/train.py index aabf4f1f62df..08a0fe5db6be 100644 --- a/train.py +++ b/train.py @@ -336,17 +336,17 @@ def train(hyp): # end epoch ---------------------------------------------------------------------------------------------------- # end training - n = opt.name - if len(n): - n = '_' + n if not n.isnumeric() else n - fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n - for f1, f2 in zip([wdir + 'last.pt', wdir 
+ 'best.pt', 'results.txt'], [flast, fbest, fresults]): - if os.path.exists(f1): - os.rename(f1, f2) # rename - ispt = f2.endswith('.pt') # is *.pt - strip_optimizer(f2) if ispt else None # strip optimizer - os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None # upload - + # Strip optimizers + n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name + fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n + for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): + if os.path.exists(f1): + os.rename(f1, f2) # rename + ispt = f2.endswith('.pt') # is *.pt + strip_optimizer(f2) if ispt else None # strip optimizer + os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None # upload + + # Finish if not opt.evolve: plot_results() # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
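
Note: patches 04 and 08 introduce and then generalize the autopad() helper so that Conv and Focus accept an optional explicit padding. The snippet below reproduces the final helper from patch 08 and adds a few illustrative calls (the example values are not from the patches) showing the 'same'-padding behaviour for square kernels, for the asymmetric (1, 3)/(3, 1) kernels used by CrossConv, and for an explicit override.

def autopad(k, p=None):  # kernel, padding
    # Pad to 'same'
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p

assert autopad(3) == 1            # 3x3 kernel -> pad 1
assert autopad(5) == 2            # 5x5 kernel -> pad 2
assert autopad((1, 3)) == [0, 1]  # asymmetric kernel -> per-dimension padding
assert autopad(3, 0) == 0         # explicit padding overrides the default

A side effect of the patch-08 form is that an explicit p=0 now takes effect, whereas the earlier `p or autopad(k)` expression from patch 04 would have treated 0 as falsy and auto-padded anyway.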
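
Note: several of these patches (01, 09, 10, 13 and 14) deal with the same DataParallel/DistributedDataParallel wrinkle: the wrapper and the wrapped module are distinct objects, so attributes such as nc, hyp, gr and names attached to the underlying model are only reachable through model.module once the model is wrapped. Patch 14 factors that check into an is_parallel() helper. Below is a minimal standalone sketch of the convention, not code taken from the patches; TinyModel and the attribute values are placeholders.

import torch
import torch.nn as nn


def is_parallel(model):
    # True when the model is wrapped in DataParallel or DistributedDataParallel
    # (same check as the helper added to utils/torch_utils.py in patch 14)
    return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)


class TinyModel(nn.Module):
    # Stand-in for models.yolo.Model; illustrative only
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, 1)

    def forward(self, x):
        return self.conv(x)


model = TinyModel()
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    model = nn.DataParallel(model.cuda())  # train.py uses DistributedDataParallel; DP shown for brevity

# Attach run-time attributes to the underlying module, once
m = model.module if is_parallel(model) else model
m.nc = 80                    # number of classes -- illustrative value
m.hyp = {'anchor_t': 4.0}    # hyperparameters -- illustrative subset
m.gr = 1.0                   # giou loss ratio (obj_loss = 1.0 or giou)
m.names = ['person', 'car']  # class names -- illustrative

# Consumers unwrap the same way instead of branching on hasattr(model, 'module') at every call site
print((model.module if is_parallel(model) else model).nc)  # -> 80

Patch 09 had pushed hasattr(model, 'module') checks down into compute_loss(); patch 13 removes them again, and the unwrapping that remains lives at the train.py call sites and in ModelEMA.update_attr() from patch 14.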