From 643e466bf8dfffdb2c37dcc5fd76e428a477b932 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:17:10 -0800
Subject: [PATCH 01/13] Enhanced model+EMA checkpointing

---
 train.py         | 15 ++++++++++-----
 utils/general.py |  4 ++--
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/train.py b/train.py
index 8533667fe57f..e26325db608f 100644
--- a/train.py
+++ b/train.py
@@ -136,6 +136,9 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
                                 id=ckpt.get('wandb_id') if 'ckpt' in locals() else None)
         loggers = {'wandb': wandb}  # loggers dict

+    # EMA
+    ema = ModelEMA(model) if rank in [-1, 0] else None
+
     # Resume
     start_epoch, best_fitness = 0, 0.0
     if pretrained:
@@ -144,6 +147,10 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             optimizer.load_state_dict(ckpt['optimizer'])
             best_fitness = ckpt['best_fitness']

+        # EMA
+        if ckpt.get('ema'):
+            pass
+
         # Results
         if ckpt.get('training_results') is not None:
             results_file.write_text(ckpt['training_results'])  # write results.txt
@@ -173,9 +180,6 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
         logger.info('Using SyncBatchNorm()')

-    # EMA
-    ema = ModelEMA(model) if rank in [-1, 0] else None
-
     # DDP mode
     if cuda and rank != -1:
         model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)
@@ -378,7 +382,8 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             ckpt = {'epoch': epoch,
                     'best_fitness': best_fitness,
                     'training_results': results_file.read_text(),
-                    'model': ema.ema,
+                    'model': model,
+                    'ema': (ema.ema, ema.updates),
                     'optimizer': None if final_epoch else optimizer.state_dict(),
                     'wandb_id': wandb_run.id if wandb else None}

@@ -441,7 +446,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
     parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path')
     parser.add_argument('--epochs', type=int, default=300)
     parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
-    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes')
+    parser.add_argument('--img-size', nargs='+', type=int, default=[128, 128], help='[train, test] image sizes')
     parser.add_argument('--rect', action='store_true', help='rectangular training')
     parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
     parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')

diff --git a/utils/general.py b/utils/general.py
index 3b5f4629b00a..e5bbc50c6177 100755
--- a/utils/general.py
+++ b/utils/general.py
@@ -484,8 +484,8 @@ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=Non
 def strip_optimizer(f='weights/best.pt', s=''):  # from utils.general import *; strip_optimizer()
     # Strip optimizer from 'f' to finalize training, optionally save as 's'
     x = torch.load(f, map_location=torch.device('cpu'))
-    for key in 'optimizer', 'training_results', 'wandb_id':
-        x[key] = None
+    for k in 'optimizer', 'training_results', 'wandb_id', 'ema':  # keys
+        x[k] = None
     x['epoch'] = -1
     x['model'].half()  # to FP16
     for p in x['model'].parameters():
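Note: after PATCH 01 a checkpoint stores the raw training model under 'model' and the EMA as an (ema, updates) tuple under 'ema'. A minimal inspection sketch (the key names mirror the ckpt dict built in train.py; the path is illustrative):

    import torch

    ckpt = torch.load('weights/last.pt', map_location='cpu')  # illustrative path
    print(sorted(ckpt))    # best_fitness, ema, epoch, model, optimizer, training_results, wandb_id
    model = ckpt['model']  # raw training model
    ema_model, ema_updates = ckpt['ema']  # EMA weights and EMA update counter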
From 553b58247496ffd4380e599599193d6f69334678 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:23:15 -0800
Subject: [PATCH 02/13] update

---
 train.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index e26325db608f..66578295d590 100644
--- a/train.py
+++ b/train.py
@@ -148,8 +148,9 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             best_fitness = ckpt['best_fitness']

         # EMA
-        if ckpt.get('ema'):
-            pass
+        if ema and ckpt.get('ema'):
+            ema.ema.load_statedict(ckpt['ema'][0].float().to(device).statedict())
+            ema.updates = ckpt['ema'][1]

         # Results
         if ckpt.get('training_results') is not None:

From dc3ca84249031026b4314a24407586dcd9f9a8e9 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:25:23 -0800
Subject: [PATCH 03/13] bug fix

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 66578295d590..b835ec9887b2 100644
--- a/train.py
+++ b/train.py
@@ -149,7 +149,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):

         # EMA
         if ema and ckpt.get('ema'):
-            ema.ema.load_statedict(ckpt['ema'][0].float().to(device).statedict())
+            ema.ema.load_state_dict(ckpt['ema'][0].float().statedict())
             ema.updates = ckpt['ema'][1]

         # Results

From 554b2ed5f005ab4fc3128ef710e5a32ead2fb5d0 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:26:08 -0800
Subject: [PATCH 04/13] bug fix 2

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index b835ec9887b2..f9bf0b42638d 100644
--- a/train.py
+++ b/train.py
@@ -149,7 +149,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):

         # EMA
         if ema and ckpt.get('ema'):
-            ema.ema.load_state_dict(ckpt['ema'][0].float().statedict())
+            ema.ema.load_state_dict(ckpt['ema'][0].float().state_dict())
             ema.updates = ckpt['ema'][1]

         # Results

From 8c677c14c96ff423513c1c8f9c147e631079ba2b Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:27:47 -0800
Subject: [PATCH 05/13] always save optimizer

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index f9bf0b42638d..d73dadd39f6c 100644
--- a/train.py
+++ b/train.py
@@ -385,7 +385,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
                     'training_results': results_file.read_text(),
                     'model': model,
                     'ema': (ema.ema, ema.updates),
-                    'optimizer': None if final_epoch else optimizer.state_dict(),
+                    'optimizer': optimizer.state_dict(),
                     'wandb_id': wandb_run.id if wandb else None}

             # Save last, best and delete

From 1dba401ec513e04ec81e841406503df30874940e Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:34:18 -0800
Subject: [PATCH 06/13] ema half

---
 utils/torch_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 1b1cc2038c55..6ea84b6e508d 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -270,8 +270,8 @@ class ModelEMA:
     def __init__(self, model, decay=0.9999, updates=0):
         # Create EMA
         self.ema = deepcopy(model.module if is_parallel(model) else model).eval()  # FP32 EMA
-        # if next(model.parameters()).device.type != 'cpu':
-        #     self.ema.half()  # FP16 EMA
+        if next(model.parameters()).device.type != 'cpu':
+            self.ema.half()  # FP16 EMA
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():

From 02042ae20dcf445aec3a257868565f912844a2af Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:41:41 -0800
Subject: [PATCH 07/13] remove model.float()

---
 test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test.py b/test.py
index ecd45f5f4943..9f484c809052 100644
--- a/test.py
+++ b/test.py
@@ -272,7 +272,6 @@ def test(data,
     if not training:
         s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
         print(f"Results saved to {save_dir}{s}")
-        model.float()  # for training
     maps = np.zeros(nc) + map
     for i, c in enumerate(ap_class):
         maps[c] = ap[i]
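Note: PATCHES 02-04 converge on restoring the EMA on resume via a float() round trip, which matters once PATCH 06 stores the EMA in FP16 while the live copy runs in FP32. A standalone sketch of that cast-then-load step, with a plain nn.Linear standing in for the YOLO model:

    import torch.nn as nn

    live = nn.Linear(4, 2)           # live FP32 EMA copy (ema.ema in train.py)
    stored = nn.Linear(4, 2).half()  # checkpointed, possibly FP16, EMA copy
    live.load_state_dict(stored.float().state_dict())  # cast up to FP32 before loading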
From 976002d987cf547e2622cb296edc70c0331450f9 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 14:48:27 -0800
Subject: [PATCH 08/13] model half

---
 train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index d73dadd39f6c..85ae5a922e35 100644
--- a/train.py
+++ b/train.py
@@ -4,6 +4,7 @@
 import os
 import random
 import time
+from copy import deepcopy
 from pathlib import Path
 from threading import Thread

@@ -383,7 +384,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             ckpt = {'epoch': epoch,
                     'best_fitness': best_fitness,
                     'training_results': results_file.read_text(),
-                    'model': model,
+                    'model': deepcopy(model).half(),
                     'ema': (ema.ema, ema.updates),
                     'optimizer': optimizer.state_dict(),
                     'wandb_id': wandb_run.id if wandb else None}

From 4d5ab3b5c2a4857be3a966c5286048e091eca844 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 15:00:18 -0800
Subject: [PATCH 09/13] carry ema/model in fp32

---
 test.py              | 1 +
 train.py             | 7 ++++---
 utils/torch_utils.py | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/test.py b/test.py
index 9f484c809052..ecd45f5f4943 100644
--- a/test.py
+++ b/test.py
@@ -272,6 +272,7 @@ def test(data,
     if not training:
         s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
         print(f"Results saved to {save_dir}{s}")
+        model.float()  # for training
     maps = np.zeros(nc) + map
     for i, c in enumerate(ap_class):
         maps[c] = ap[i]

diff --git a/train.py b/train.py
index 85ae5a922e35..439ae453cad3 100644
--- a/train.py
+++ b/train.py
@@ -4,7 +4,6 @@
 import os
 import random
 import time
-from copy import deepcopy
 from pathlib import Path
 from threading import Thread

@@ -384,8 +383,8 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             ckpt = {'epoch': epoch,
                     'best_fitness': best_fitness,
                     'training_results': results_file.read_text(),
-                    'model': deepcopy(model).half(),
-                    'ema': (ema.ema, ema.updates),
+                    'model': model.half(),
+                    'ema': (ema.ema.half(), ema.updates),
                     'optimizer': optimizer.state_dict(),
                     'wandb_id': wandb_run.id if wandb else None}

@@ -394,6 +393,8 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             if best_fitness == fi:
                 torch.save(ckpt, best)
             del ckpt
+            model.float(), ema.ema.float()
+
         # end epoch ----------------------------------------------------------------------------------------------------
     # end training

diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 6ea84b6e508d..1b1cc2038c55 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -270,8 +270,8 @@ class ModelEMA:
     def __init__(self, model, decay=0.9999, updates=0):
         # Create EMA
         self.ema = deepcopy(model.module if is_parallel(model) else model).eval()  # FP32 EMA
-        if next(model.parameters()).device.type != 'cpu':
-            self.ema.half()  # FP16 EMA
+        # if next(model.parameters()).device.type != 'cpu':
+        #     self.ema.half()  # FP16 EMA
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
         for p in self.ema.parameters():
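Note: PATCH 08 checkpointed deepcopy(model).half(), paying for a full model copy per save; PATCH 09 instead halves the live model and EMA for the save and casts them back to FP32 afterwards. Roughly:

    import torch
    import torch.nn as nn

    model = nn.Linear(4, 2)  # stand-in for the YOLO model
    torch.save({'model': model.half()}, '/tmp/ckpt.pt')  # FP16 on disk, about half the size
    model.float()  # back to FP32 so training continues in full precision

One trade-off of the in-place round trip is that the weights pass through FP16 once per save, truncating their low-order bits.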
From 189bf638d5b5c44f77a729ee5524ee5c0521cded Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 15:06:47 -0800
Subject: [PATCH 10/13] rm model.float()

---
 test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test.py b/test.py
index ecd45f5f4943..9f484c809052 100644
--- a/test.py
+++ b/test.py
@@ -272,7 +272,6 @@ def test(data,
     if not training:
         s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
         print(f"Results saved to {save_dir}{s}")
-        model.float()  # for training
     maps = np.zeros(nc) + map
     for i, c in enumerate(ap_class):
         maps[c] = ap[i]

From bd4cb50a8e5a33aa490da561be5649fb1c6515fb Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 15:10:29 -0800
Subject: [PATCH 11/13] both to float always

---
 train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 439ae453cad3..bfa58c0d6fcf 100644
--- a/train.py
+++ b/train.py
@@ -393,7 +393,8 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             if best_fitness == fi:
                 torch.save(ckpt, best)
             del ckpt
-            model.float(), ema.ema.float()
+
+        model.float(), ema.ema.float()

         # end epoch ----------------------------------------------------------------------------------------------------
     # end training

From a09b8fa3911e2479ceef9a83bc2edb3ac3ad8135 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 15:11:48 -0800
Subject: [PATCH 12/13] cleanup

---
 train.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/train.py b/train.py
index bfa58c0d6fcf..f302abbae59b 100644
--- a/train.py
+++ b/train.py
@@ -196,7 +196,6 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):

     # Process 0
     if rank in [-1, 0]:
-        ema.updates = start_epoch * nb // accumulate  # set EMA updates
         testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, opt,  # testloader
                                        hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1,
                                        world_size=opt.world_size, workers=opt.workers,
@@ -340,8 +339,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
         # DDP process 0 or single-GPU
         if rank in [-1, 0]:
             # mAP
-            if ema:
-                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
+            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
             final_epoch = epoch + 1 == epochs
             if not opt.notest or final_epoch:  # Calculate mAP
                 results, maps, times = test.test(opt.data,
@@ -450,7 +448,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
     parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path')
     parser.add_argument('--epochs', type=int, default=300)
     parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs')
-    parser.add_argument('--img-size', nargs='+', type=int, default=[128, 128], help='[train, test] image sizes')
+    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes')
     parser.add_argument('--rect', action='store_true', help='rectangular training')
     parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
     parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
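Note: the PATCH 12 cleanup can drop the ema.updates = start_epoch * nb // accumulate estimate because resume now restores the exact counter from ckpt['ema'][1] (PATCH 02). The counter matters because ModelEMA's decay ramps with it, e.g.:

    import math

    decay = lambda x: 0.9999 * (1 - math.exp(-x / 2000))  # ramp from ModelEMA.__init__
    print(round(decay(0), 4), round(decay(10000), 4))  # 0.0 at update 0, ~0.9932 after 10k updates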
From dcc6a98ec3f0affcaa45990517a2b549209c7b22 Mon Sep 17 00:00:00 2001
From: Glenn Jocher
Date: Wed, 24 Feb 2021 15:26:53 -0800
Subject: [PATCH 13/13] cleanup

---
 train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index f302abbae59b..7aa57fa99e24 100644
--- a/train.py
+++ b/train.py
@@ -31,7 +31,7 @@
 from utils.google_utils import attempt_download
 from utils.loss import ComputeLoss
 from utils.plots import plot_images, plot_labels, plot_results, plot_evolution
-from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first
+from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first, is_parallel

 logger = logging.getLogger(__name__)

@@ -381,7 +381,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             ckpt = {'epoch': epoch,
                     'best_fitness': best_fitness,
                     'training_results': results_file.read_text(),
-                    'model': model.half(),
+                    'model': (model.module if is_parallel(model) else model).half(),
                     'ema': (ema.ema.half(), ema.updates),
                     'optimizer': optimizer.state_dict(),
                     'wandb_id': wandb_run.id if wandb else None}
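Note: PATCH 13 saves the underlying module rather than the DDP wrapper, so checkpoints load identically on single- and multi-GPU setups. A sketch of the unwrap idiom, with is_parallel as defined in utils/torch_utils.py:

    import torch.nn as nn

    def is_parallel(model):  # same membership test as utils.torch_utils.is_parallel
        return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)

    net = nn.DataParallel(nn.Linear(4, 2))
    bare = net.module if is_parallel(net) else net  # unwrap before .half() and torch.save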