From 7a560de1c863bde0d16bf0500a37e9873fd8c1d1 Mon Sep 17 00:00:00 2001
From: yxNONG <62932917+yxNONG@users.noreply.github.com>
Date: Tue, 30 Jun 2020 19:06:28 +0800
Subject: [PATCH 1/6] Unify the checkpoint of single- and multi-GPU training

Save model.hyp etc. to the checkpoint when training on multiple GPUs.
---
 train.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 39dd0e555572..d933a5de074d 100644
--- a/train.py
+++ b/train.py
@@ -79,7 +79,7 @@ def train(hyp):
     # Create model
     model = Model(opt.cfg).to(device)
     assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc'])
-    model.names = data_dict['names']
+
 
     # Image sizes
     gs = int(max(model.stride))  # grid size (max stride)
@@ -172,6 +172,7 @@ def train(hyp):
     model.hyp = hyp  # attach hyperparameters to model
     model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
+    model.names = data_dict['names']
 
     # Class frequency
     labels = np.concatenate(dataset.labels, 0)
@@ -314,6 +315,14 @@ def train(hyp):
         # Save model
         save = (not opt.nosave) or (final_epoch and not opt.evolve)
         if save:
+            if hasattr(model, 'module'):
+                # Duplicate Model parameters for Multi-GPU save
+                ema.ema.module.nc = model.nc  # attach number of classes to model
+                ema.ema.module.hyp = model.hyp  # attach hyperparameters to model
+                ema.ema.module.gr = model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
+                ema.ema.module.class_weights = model.class_weights  # attach class weights
+                ema.ema.module.names = data_dict['names']
+
             with open(results_file, 'r') as f:  # create checkpoint
                 ckpt = {'epoch': epoch,
                         'best_fitness': best_fitness,
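
Note: the save-time block in patch 1 exists because attributes assigned to a
DataParallel/DistributedDataParallel wrapper (nc, hyp, gr, names, ...) live on
the wrapper object, not on the wrapped .module, so a checkpoint built from
ema.ema.module would otherwise silently lose them. A minimal sketch of the
pitfall, not part of the patch (runs on CPU):

    import torch.nn as nn

    model = nn.DataParallel(nn.Linear(10, 2))
    model.nc = 80                        # set on the wrapper only
    print(hasattr(model, 'nc'))          # True
    print(hasattr(model.module, 'nc'))   # False -> lost if only .module is saved
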
From 65646b3a8c2f6bfe8ee9e2c2be974d0d8c5d1407 Mon Sep 17 00:00:00 2001
From: yxNONG <62932917+yxNONG@users.noreply.github.com>
Date: Thu, 2 Jul 2020 13:48:19 +0800
Subject: [PATCH 2/6] Update utils.py

---
 utils/utils.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/utils/utils.py b/utils/utils.py
index c33f41f71410..220599935360 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -421,7 +421,9 @@ def compute_loss(p, targets, model):  # predictions, targets, model
     ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor
     lcls, lbox, lobj = ft([0]), ft([0]), ft([0])
     tcls, tbox, indices, anchors = build_targets(p, targets, model)  # targets
-    h = model.hyp  # hyperparameters
+    h = model.module.hyp if hasattr(model, 'module') else model.hyp  # hyperparameters
+    nc = model.module.nc if hasattr(model, 'module') else model.nc
+    gr = model.module.gr if hasattr(model, 'module') else model.gr
     red = 'mean'  # Loss reduction (sum or mean)
 
     # Define criteria
@@ -455,10 +457,10 @@ def compute_loss(p, targets, model):  # predictions, targets, model
             lbox += (1.0 - giou).sum() if red == 'sum' else (1.0 - giou).mean()  # giou loss
 
             # Obj
-            tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
+            tobj[b, a, gj, gi] = (1.0 - gr) + gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
 
             # Class
-            if model.nc > 1:  # cls loss (only if multiple classes)
+            if nc > 1:  # cls loss (only if multiple classes)
                 t = torch.full_like(ps[:, 5:], cn)  # targets
                 t[range(nb), tcls[i]] = cp
                 lcls += BCEcls(ps[:, 5:], t)  # BCE
@@ -477,7 +479,7 @@ def compute_loss(p, targets, model):  # predictions, targets, model
         g = 3.0  # loss gain
         lobj *= g / bs
         if nt:
-            lcls *= g / nt / model.nc
+            lcls *= g / nt / nc
             lbox *= g / nt
 
     loss = lbox + lobj + lcls
@@ -488,6 +490,8 @@ def build_targets(p, targets, model):
     # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
     det = model.module.model[-1] if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) \
         else model.model[-1]  # Detect() module
+    hyp = model.module.hyp if hasattr(model, 'module') else model.hyp
+
     na, nt = det.na, targets.shape[0]  # number of anchors, targets
     tcls, tbox, indices, anch = [], [], [], []
     gain = torch.ones(6, device=targets.device)  # normalized to gridspace gain
@@ -503,7 +507,7 @@ def build_targets(p, targets, model):
         a, t, offsets = [], targets * gain, 0
         if nt:
             r = t[None, :, 4:6] / anchors[:, None]  # wh ratio
-            j = torch.max(r, 1. / r).max(2)[0] < model.hyp['anchor_t']  # compare
+            j = torch.max(r, 1. / r).max(2)[0] < hyp['anchor_t']  # compare
             # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n) = wh_iou(anchors(3,2), gwh(n,2))
             a, t = at[j], t.repeat(na, 1, 1)[j]  # filter
 
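
Note: patch 2 repeats the same unwrap guard once per attribute read. The
pattern can be factored into a single helper; a sketch of that idea (the
helper name is illustrative, not from the repo):

    def unwrap(model):
        # Return the underlying module of a DP/DDP wrapper, else the model itself
        return model.module if hasattr(model, 'module') else model

    # e.g. h, nc, gr = unwrap(model).hyp, unwrap(model).nc, unwrap(model).gr
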
From dc44124a16990d47d4e8358e9888d0ae814b20f4 Mon Sep 17 00:00:00 2001
From: yxNONG <62932917+yxNONG@users.noreply.github.com>
Date: Thu, 2 Jul 2020 13:51:52 +0800
Subject: [PATCH 3/6] Update train.py

---
 train.py | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/train.py b/train.py
index d933a5de074d..3b7c9a575678 100644
--- a/train.py
+++ b/train.py
@@ -147,15 +147,6 @@ def train(hyp):
     # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
     # plot_lr_scheduler(optimizer, scheduler, epochs)
 
-    # Initialize distributed training
-    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
-        dist.init_process_group(backend='nccl',  # distributed backend
-                                init_method='tcp://127.0.0.1:9999',  # init method
-                                world_size=1,  # number of nodes
-                                rank=0)  # node rank
-        model = torch.nn.parallel.DistributedDataParallel(model)
-        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
-
     # Trainloader
     dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                             hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect)
@@ -173,6 +164,15 @@ def train(hyp):
     model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
     model.names = data_dict['names']
+
+    # Initialize distributed training
+    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
+        dist.init_process_group(backend='nccl',  # distributed backend
+                                init_method='tcp://127.0.0.1:9999',  # init method
+                                world_size=1,  # number of nodes
+                                rank=0)  # node rank
+        model = torch.nn.parallel.DistributedDataParallel(model)
+        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
 
     # Class frequency
     labels = np.concatenate(dataset.labels, 0)
@@ -289,7 +289,7 @@ def train(hyp):
                                              batch_size=batch_size,
                                              imgsz=imgsz_test,
                                              save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'),
-                                             model=ema.ema,
+                                             model=ema.ema.module if hasattr(model, 'module') else ema.ema,
                                              single_cls=opt.single_cls,
                                              dataloader=testloader)
 
@@ -315,14 +315,6 @@ def train(hyp):
         # Save model
         save = (not opt.nosave) or (final_epoch and not opt.evolve)
         if save:
-            if hasattr(model, 'module'):
-                # Duplicate Model parameters for Multi-GPU save
-                ema.ema.module.nc = model.nc  # attach number of classes to model
-                ema.ema.module.hyp = model.hyp  # attach hyperparameters to model
-                ema.ema.module.gr = model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
-                ema.ema.module.class_weights = model.class_weights  # attach class weights
-                ema.ema.module.names = data_dict['names']
-
             with open(results_file, 'r') as f:  # create checkpoint
                 ckpt = {'epoch': epoch,
                         'best_fitness': best_fitness,
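
Note: patch 3 takes the opposite approach to patch 1: attach the attributes to
the raw model first, then wrap it, so they stay reachable through .module and
no save-time duplication is needed. Illustration, not part of the patch
(DataParallel is used here only so the snippet runs on CPU):

    import torch.nn as nn

    net = nn.Linear(10, 2)
    net.nc = 80                  # attach to the raw module first...
    net = nn.DataParallel(net)   # ...then wrap
    print(net.module.nc)         # 80 -> still reachable through .module
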
From 83aa8e50ad911bbfae0bad2723182d8a30c4cfb5 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Thu, 2 Jul 2020 12:00:55 -0700
Subject: [PATCH 4/6] Update train.py

---
 train.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/train.py b/train.py
index 3b7c9a575678..dfc9ecdbdbc5 100644
--- a/train.py
+++ b/train.py
@@ -79,7 +79,6 @@ def train(hyp):
     # Create model
     model = Model(opt.cfg).to(device)
     assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc'])
-
 
     # Image sizes
     gs = int(max(model.stride))  # grid size (max stride)
@@ -133,7 +132,13 @@ def train(hyp):
             with open(results_file, 'w') as file:
                 file.write(ckpt['training_results'])  # write results.txt
 
+        # epochs
         start_epoch = ckpt['epoch'] + 1
+        if epochs < start_epoch:
+            print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
+                  (opt.weights, ckpt['epoch'], epochs))
+            epochs += ckpt['epoch']  # finetune additional epochs
+
         del ckpt
 
     # Mixed precision training https://github.com/NVIDIA/apex
@@ -147,6 +152,15 @@ def train(hyp):
     # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
     # plot_lr_scheduler(optimizer, scheduler, epochs)
 
+    # Initialize distributed training
+    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
+        dist.init_process_group(backend='nccl',  # distributed backend
+                                init_method='tcp://127.0.0.1:9999',  # init method
+                                world_size=1,  # number of nodes
+                                rank=0)  # node rank
+        model = torch.nn.parallel.DistributedDataParallel(model)
+        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
+
     # Trainloader
     dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
                                             hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect)
@@ -155,7 +169,7 @@ def train(hyp):
 
     # Testloader
     testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt,
-                                  hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0]
+                                   hyp=hyp, augment=False, cache=opt.cache_images, rect=True)[0]
 
     # Model parameters
     hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
@@ -164,15 +178,6 @@ def train(hyp):
     model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
     model.names = data_dict['names']
-
-    # Initialize distributed training
-    if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
-        dist.init_process_group(backend='nccl',  # distributed backend
-                                init_method='tcp://127.0.0.1:9999',  # init method
-                                world_size=1,  # number of nodes
-                                rank=0)  # node rank
-        model = torch.nn.parallel.DistributedDataParallel(model)
-        # pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
 
     # Class frequency
     labels = np.concatenate(dataset.labels, 0)
@@ -373,7 +378,7 @@ def train(hyp):
     parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%')
     parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
     opt = parser.parse_args()
-    opt.weights = last if opt.resume else opt.weights
+    opt.weights = last if opt.resume and not opt.weights else opt.weights
     opt.cfg = check_file(opt.cfg)  # check file
     opt.data = check_file(opt.data)  # check file
     print(opt)

From 56c412aff62d7f6c246c200d1817309fa7ea8755 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Thu, 2 Jul 2020 12:01:43 -0700
Subject: [PATCH 5/6] Update utils.py

---
 utils/utils.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/utils/utils.py b/utils/utils.py
index 220599935360..305486a5f6a3 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -47,7 +47,7 @@ def check_git_status():
 
 def check_img_size(img_size, s=32):
     # Verify img_size is a multiple of stride s
-    new_size = make_divisible(img_size, s)  # ceil gs-multiple
+    new_size = make_divisible(img_size, int(s))  # ceil gs-multiple
     if new_size != img_size:
         print('WARNING: --img-size %g must be multiple of max stride %g, updating to %g' % (img_size, s, new_size))
     return new_size
@@ -421,9 +421,7 @@ def compute_loss(p, targets, model):  # predictions, targets, model
     ft = torch.cuda.FloatTensor if p[0].is_cuda else torch.Tensor
     lcls, lbox, lobj = ft([0]), ft([0]), ft([0])
     tcls, tbox, indices, anchors = build_targets(p, targets, model)  # targets
-    h = model.module.hyp if hasattr(model, 'module') else model.hyp  # hyperparameters
-    nc = model.module.nc if hasattr(model, 'module') else model.nc
-    gr = model.module.gr if hasattr(model, 'module') else model.gr
+    h = model.hyp  # hyperparameters
     red = 'mean'  # Loss reduction (sum or mean)
 
     # Define criteria
@@ -457,10 +455,10 @@ def compute_loss(p, targets, model):  # predictions, targets, model
             lbox += (1.0 - giou).sum() if red == 'sum' else (1.0 - giou).mean()  # giou loss
 
             # Obj
-            tobj[b, a, gj, gi] = (1.0 - gr) + gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
+            tobj[b, a, gj, gi] = (1.0 - model.gr) + model.gr * giou.detach().clamp(0).type(tobj.dtype)  # giou ratio
 
             # Class
-            if nc > 1:  # cls loss (only if multiple classes)
+            if model.nc > 1:  # cls loss (only if multiple classes)
                 t = torch.full_like(ps[:, 5:], cn)  # targets
                 t[range(nb), tcls[i]] = cp
                 lcls += BCEcls(ps[:, 5:], t)  # BCE
@@ -479,7 +477,7 @@ def compute_loss(p, targets, model):  # predictions, targets, model
         g = 3.0  # loss gain
         lobj *= g / bs
         if nt:
-            lcls *= g / nt / nc
+            lcls *= g / nt / model.nc
             lbox *= g / nt
 
     loss = lbox + lobj + lcls
@@ -490,8 +488,6 @@ def build_targets(p, targets, model):
     # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
     det = model.module.model[-1] if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) \
         else model.model[-1]  # Detect() module
-    hyp = model.module.hyp if hasattr(model, 'module') else model.hyp
-
     na, nt = det.na, targets.shape[0]  # number of anchors, targets
     tcls, tbox, indices, anch = [], [], [], []
     gain = torch.ones(6, device=targets.device)  # normalized to gridspace gain
@@ -507,7 +503,7 @@ def build_targets(p, targets, model):
         a, t, offsets = [], targets * gain, 0
         if nt:
             r = t[None, :, 4:6] / anchors[:, None]  # wh ratio
-            j = torch.max(r, 1. / r).max(2)[0] < hyp['anchor_t']  # compare
+            j = torch.max(r, 1. / r).max(2)[0] < model.hyp['anchor_t']  # compare
             # j = wh_iou(anchors, t[:, 4:6]) > model.hyp['iou_t']  # iou(3,n) = wh_iou(anchors(3,2), gwh(n,2))
             a, t = at[j], t.repeat(na, 1, 1)[j]  # filter
 
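
Note: patch 5 reverts the per-call unwrap guards of patch 2 (patch 4 restored
the attach-then-wrap ordering in train.py, making them unnecessary) and
hardens check_img_size() with int(s), presumably because the max stride can
arrive as a tensor or float. A sketch of the rounding check_img_size() relies
on (equivalent ceil logic, not the repo's source):

    import math

    def make_divisible_sketch(x, divisor):
        # Round x up to the nearest multiple of divisor
        return math.ceil(x / divisor) * divisor

    print(make_divisible_sketch(633, 32))  # 640
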
From c3c6ba05f67794537836aa5c2038274e9d371fb8 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Thu, 2 Jul 2020 12:03:45 -0700
Subject: [PATCH 6/6] Update torch_utils.py

---
 utils/torch_utils.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index e069792e6e3f..a62adc9cf122 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -54,6 +54,11 @@ def time_synchronized():
     return time.time()
 
 
+def is_parallel(model):
+    # Returns True if model is of type DP or DDP
+    return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
+
+
 def initialize_weights(model):
     for m in model.modules():
         t = type(m)
@@ -111,8 +116,8 @@ def model_info(model, verbose=False):
 
     try:  # FLOPS
         from thop import profile
-        macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False)
-        fs = ', %.1f GFLOPS' % (macs / 1E9 * 2)
+        flops = profile(deepcopy(model), inputs=(torch.zeros(1, 3, 64, 64),), verbose=False)[0] / 1E9 * 2
+        fs = ', %.1f GFLOPS' % (flops * 100)  # 640x640 GFLOPS (64x64 scaled by 100)
     except:
         fs = ''
 
@@ -185,7 +190,7 @@ def update(self, model):
         self.updates += 1
         d = self.decay(self.updates)
         with torch.no_grad():
-            if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
+            if is_parallel(model):
                 msd, esd = model.module.state_dict(), self.ema.module.state_dict()
             else:
                 msd, esd = model.state_dict(), self.ema.state_dict()
@@ -196,7 +201,8 @@ def update(self, model):
                     v += (1. - d) * msd[k].detach()
 
     def update_attr(self, model):
-        # Assign attributes (which may change during training)
-        for k in model.__dict__.keys():
-            if not k.startswith('_'):
-                setattr(self.ema, k, getattr(model, k))
+        # Update class attributes
+        ema = self.ema.module if is_parallel(model) else self.ema
+        for k, v in model.__dict__.items():
+            if not k.startswith('_') and k != 'module':
+                setattr(ema, k, v)
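
Note: patch 6 centralizes the wrapper check in is_parallel() and rewrites
update_attr() to copy public attributes onto the unwrapped EMA, skipping
'module' so a wrapper reference is never nested into the EMA model. The FLOPS
estimate now profiles a deepcopy at 64x64 and multiplies by 100, since
(640/64)^2 = 100 scales the count to a 640x640 input. Usage sketch of the new
helper (runs on CPU):

    import torch.nn as nn

    def is_parallel(model):
        # True if model is wrapped in DataParallel or DistributedDataParallel
        return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)

    print(is_parallel(nn.Linear(2, 2)))                   # False
    print(is_parallel(nn.DataParallel(nn.Linear(2, 2))))  # True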