diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml
index d62cf5c1600d..ee472297107e 100644
--- a/.github/workflows/greetings.yml
+++ b/.github/workflows/greetings.yml
@@ -11,7 +11,7 @@ jobs:
         repo-token: ${{ secrets.GITHUB_TOKEN }}
         pr-message: |
           👋 Hello @${{ github.actor }}, thank you for submitting a 🚀 PR! To allow your work to be integrated as seamlessly as possible, we advise you to:
-          - ✅ Verify your PR is **up-to-date with origin/master.** If your PR is behind origin/master update by running the following, replacing 'feature' with the name of your local branch:
+          - ✅ Verify your PR is **up-to-date with origin/master.** If your PR is behind origin/master an automatic [GitHub actions](https://github.com/ultralytics/yolov5/blob/master/.github/workflows/rebase.yml) rebase may be attempted by including the /rebase command in a comment body, or by running the following code, replacing 'feature' with the name of your local branch:
           ```bash
           git remote add upstream https://github.com/ultralytics/yolov5.git
           git fetch upstream
diff --git a/Dockerfile b/Dockerfile
index 98dfee204770..fe64d6da29f9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@
 FROM nvcr.io/nvidia/pytorch:20.12-py3
 # Install linux packages
-RUN apt update && apt install -y screen libgl1-mesa-glx
+RUN apt update && apt install -y zip screen libgl1-mesa-glx
 # Install python dependencies
 RUN python -m pip install --upgrade pip
diff --git a/README.md b/README.md
index 3c14071698c5..b7129e80adfe 100755
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

-![CI CPU testing](https://github.com/ultralytics/yolov5/workflows/CI%20CPU%20testing/badge.svg)
+<a href="https://github.com/ultralytics/yolov5/actions"><img src="https://github.com/ultralytics/yolov5/workflows/CI%20CPU%20testing/badge.svg" alt="CI CPU testing"></a>

 This repository represents Ultralytics open-source research into future object detection methods, and incorporates lessons learned and best practices evolved over thousands of hours of training and evolution on anonymized client datasets. **All code and models are under active development, and are subject to modification or deletion without notice.** Use at your own risk.
@@ -89,17 +89,15 @@ To run inference on example images in `data/images`:
 ```bash
 $ python detect.py --source data/images --weights yolov5s.pt --conf 0.25
-Namespace(agnostic_nms=False, augment=False, classes=None, conf_thres=0.25, device='', img_size=640, iou_thres=0.45, save_conf=False, save_dir='runs/detect', save_txt=False, source='data/images/', update=False, view_img=False, weights=['yolov5s.pt'])
-Using torch 1.7.0+cu101 CUDA:0 (Tesla V100-SXM2-16GB, 16130MB)
-
-Downloading https://github.com/ultralytics/yolov5/releases/download/v3.1/yolov5s.pt to yolov5s.pt... 100%|██████████████| 14.5M/14.5M [00:00<00:00, 21.3MB/s]
+Namespace(agnostic_nms=False, augment=False, classes=None, conf_thres=0.25, device='', exist_ok=False, img_size=640, iou_thres=0.45, name='exp', project='runs/detect', save_conf=False, save_txt=False, source='data/images/', update=False, view_img=False, weights=['yolov5s.pt'])
+YOLOv5 v4.0-96-g83dc1b4 torch 1.7.0+cu101 CUDA:0 (Tesla V100-SXM2-16GB, 16160.5MB)
 Fusing layers...
-Model Summary: 232 layers, 7459581 parameters, 0 gradients
-image 1/2 data/images/bus.jpg: 640x480 4 persons, 1 buss, 1 skateboards, Done. (0.012s)
-image 2/2 data/images/zidane.jpg: 384x640 2 persons, 2 ties, Done. (0.012s)
-Results saved to runs/detect/exp
-Done. (0.113s)
+Model Summary: 224 layers, 7266973 parameters, 0 gradients, 17.0 GFLOPS
+image 1/2 /content/yolov5/data/images/bus.jpg: 640x480 4 persons, 1 bus, Done.
(0.010s) +image 2/2 /content/yolov5/data/images/zidane.jpg: 384x640 2 persons, 1 tie, Done. (0.011s) +Results saved to runs/detect/exp2 +Done. (0.103s) ``` @@ -108,18 +106,17 @@ Done. (0.113s) To run **batched inference** with YOLOv5 and [PyTorch Hub](https://github.com/ultralytics/yolov5/issues/36): ```python import torch -from PIL import Image # Model model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True) # Images -img1 = Image.open('zidane.jpg') -img2 = Image.open('bus.jpg') -imgs = [img1, img2] # batched list of images +dir = 'https://github.com/ultralytics/yolov5/raw/master/data/images/' +imgs = [dir + f for f in ('zidane.jpg', 'bus.jpg')] # batched list of images # Inference -result = model(imgs) +results = model(imgs) +results.print() # or .show(), .save() ``` diff --git a/data/scripts/get_coco.sh b/data/scripts/get_coco.sh index b0df905c8525..bbb1e9291d5b 100755 --- a/data/scripts/get_coco.sh +++ b/data/scripts/get_coco.sh @@ -10,8 +10,9 @@ # Download/unzip labels d='../' # unzip directory url=https://github.com/ultralytics/yolov5/releases/download/v1.0/ -f='coco2017labels.zip' # 68 MB -echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove +f='coco2017labels.zip' # or 'coco2017labels-segments.zip', 68 MB +echo 'Downloading' $url$f ' ...' +curl -L $url$f -o $f && unzip -q $f -d $d && rm $f & # download, unzip, remove in background # Download/unzip images d='../coco/images' # unzip directory @@ -20,7 +21,7 @@ f1='train2017.zip' # 19G, 118k images f2='val2017.zip' # 1G, 5k images f3='test2017.zip' # 7G, 41k images (optional) for f in $f1 $f2; do - echo 'Downloading' $url$f '...' && curl -L $url$f -o $f # download, (unzip, remove in background) - unzip -q $f -d $d && rm $f & + echo 'Downloading' $url$f '...' + curl -L $url$f -o $f && unzip -q $f -d $d && rm $f & # download, unzip, remove in background done wait # finish background tasks diff --git a/data/scripts/get_voc.sh b/data/scripts/get_voc.sh index 06414b085095..13b83c28d706 100644 --- a/data/scripts/get_voc.sh +++ b/data/scripts/get_voc.sh @@ -18,8 +18,8 @@ f1=VOCtrainval_06-Nov-2007.zip # 446MB, 5012 images f2=VOCtest_06-Nov-2007.zip # 438MB, 4953 images f3=VOCtrainval_11-May-2012.zip # 1.95GB, 17126 images for f in $f3 $f2 $f1; do - echo 'Downloading' $url$f '...' && curl -L $url$f -o $f # download, (unzip, remove in background) - unzip -q $f -d $d && rm $f & + echo 'Downloading' $url$f '...' 
+ curl -L $url$f -o $f && unzip -q $f -d $d && rm $f & # download, unzip, remove in background done wait # finish background tasks diff --git a/detect.py b/detect.py index f9085e670916..22bf21b4c825 100644 --- a/detect.py +++ b/detect.py @@ -9,8 +9,8 @@ from models.experimental import attempt_load from utils.datasets import LoadStreams, LoadImages -from utils.general import check_img_size, check_requirements, non_max_suppression, apply_classifier, scale_coords, \ - xyxy2xywh, strip_optimizer, set_logging, increment_path +from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \ + scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path from utils.plots import plot_one_box from utils.torch_utils import select_device, load_classifier, time_synchronized @@ -45,7 +45,7 @@ def detect(save_img=False): # Set Dataloader vid_path, vid_writer = None, None if webcam: - view_img = True + view_img = check_imshow() cudnn.benchmark = True # set True to speed up constant image size inference dataset = LoadStreams(source, img_size=imgsz, stride=stride) else: @@ -118,6 +118,7 @@ def detect(save_img=False): # Stream results if view_img: cv2.imshow(str(p), im0) + cv2.waitKey(1) # 1 millisecond # Save results (image with detections) if save_img: diff --git a/hubconf.py b/hubconf.py index 2a34813310e8..47eee4477725 100644 --- a/hubconf.py +++ b/hubconf.py @@ -133,9 +133,14 @@ def custom(path_or_model='path/to/model.pt', autoshape=True): # model = custom(path_or_model='path/to/model.pt') # custom example # Verify inference + import numpy as np from PIL import Image - imgs = [Image.open(x) for x in Path('data/images').glob('*.jpg')] - results = model(imgs) + imgs = [Image.open('data/images/bus.jpg'), # PIL + 'data/images/zidane.jpg', # filename + 'https://github.com/ultralytics/yolov5/raw/master/data/images/bus.jpg', # URI + np.zeros((640, 480, 3))] # numpy + + results = model(imgs) # batched inference results.print() results.save() diff --git a/models/common.py b/models/common.py index e8adb66293d5..ad35f908d865 100644 --- a/models/common.py +++ b/models/common.py @@ -1,16 +1,17 @@ # This file contains modules common to various models import math +from pathlib import Path import numpy as np import requests import torch import torch.nn as nn -from PIL import Image, ImageDraw +from PIL import Image from utils.datasets import letterbox from utils.general import non_max_suppression, make_divisible, scale_coords, xyxy2xywh -from utils.plots import color_list +from utils.plots import color_list, plot_one_box def autopad(k, p=None): # kernel, padding @@ -195,10 +196,12 @@ def forward(self, imgs, size=640, augment=False, profile=False): # Pre-process n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs]) # number of images, list of images - shape0, shape1 = [], [] # image and inference shapes + shape0, shape1, files = [], [], [] # image and inference shapes, filenames for i, im in enumerate(imgs): if isinstance(im, str): # filename or uri - im = Image.open(requests.get(im, stream=True).raw if im.startswith('http') else im) # open + im, f = Image.open(requests.get(im, stream=True).raw if im.startswith('http') else im), im # open + im.filename = f # for uri + files.append(Path(im.filename).with_suffix('.jpg').name if isinstance(im, Image.Image) else f'image{i}.jpg') im = np.array(im) # to numpy if im.shape[0] < 5: # image in CHW im = im.transpose((1, 2, 0)) # reverse dataloader .transpose(2, 0, 1) @@ -223,25 +226,26 @@ def 
forward(self, imgs, size=640, augment=False, profile=False):
         for i in range(n):
             scale_coords(shape1, y[i][:, :4], shape0[i])

-        return Detections(imgs, y, self.names)
+        return Detections(imgs, y, files, self.names)

 class Detections:
     # detections class for YOLOv5 inference results
-    def __init__(self, imgs, pred, names=None):
+    def __init__(self, imgs, pred, files, names=None):
         super(Detections, self).__init__()
         d = pred[0].device  # device
         gn = [torch.tensor([*[im.shape[i] for i in [1, 0, 1, 0]], 1., 1.], device=d) for im in imgs]  # normalizations
         self.imgs = imgs  # list of images as numpy arrays
         self.pred = pred  # list of tensors pred[0] = (xyxy, conf, cls)
         self.names = names  # class names
+        self.files = files  # image filenames
         self.xyxy = pred  # xyxy pixels
         self.xywh = [xyxy2xywh(x) for x in pred]  # xywh pixels
         self.xyxyn = [x / g for x, g in zip(self.xyxy, gn)]  # xyxy normalized
         self.xywhn = [x / g for x, g in zip(self.xywh, gn)]  # xywh normalized
         self.n = len(self.pred)

-    def display(self, pprint=False, show=False, save=False, render=False):
+    def display(self, pprint=False, show=False, save=False, render=False, save_dir=''):
         colors = color_list()
         for i, (img, pred) in enumerate(zip(self.imgs, self.pred)):
             str = f'image {i + 1}/{len(self.pred)}: {img.shape[0]}x{img.shape[1]} '
@@ -250,16 +254,16 @@ def display(self, pprint=False, show=False, save=False, render=False):
                 n = (pred[:, -1] == c).sum()  # detections per class
                 str += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string
             if show or save or render:
-                img = Image.fromarray(img.astype(np.uint8)) if isinstance(img, np.ndarray) else img  # from np
                 for *box, conf, cls in pred:  # xyxy, confidence, class
-                    # str += '%s %.2f, ' % (names[int(cls)], conf)  # label
-                    ImageDraw.Draw(img).rectangle(box, width=4, outline=colors[int(cls) % 10])  # plot
+                    label = f'{self.names[int(cls)]} {conf:.2f}'
+                    plot_one_box(box, img, label=label, color=colors[int(cls) % 10])
+                img = Image.fromarray(img.astype(np.uint8)) if isinstance(img, np.ndarray) else img  # from np
             if pprint:
                 print(str.rstrip(', '))
             if show:
-                img.show(f'image {i}')  # show
+                img.show(self.files[i])  # show
             if save:
-                f = f'results{i}.jpg'
+                f = Path(save_dir) / self.files[i]
                 img.save(f)  # save
                 print(f"{'Saving' * (i == 0)} {f},", end='' if i < self.n - 1 else ' done.\n')
             if render:
@@ -271,8 +275,9 @@ def print(self):
     def show(self):
         self.display(show=True)  # show results

-    def save(self):
-        self.display(save=True)  # save results
+    def save(self, save_dir='results/'):
+        Path(save_dir).mkdir(exist_ok=True)
+        self.display(save=True, save_dir=save_dir)  # save results

     def render(self):
         self.display(render=True)  # render results
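The reworked `Detections` class above now tracks per-image filenames and accepts a `save_dir`. A minimal sketch of how the new pieces fit together, assuming the `ultralytics/yolov5` hub entrypoint, network access, and a writable `results/` directory (the image URL is taken from the repo's own examples):

```python
import torch

# load the autoshaped hub model (downloads yolov5s.pt on first use)
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)

# filenames are now inferred per input, so URIs, file paths, PIL images and numpy arrays all work
results = model(['https://github.com/ultralytics/yolov5/raw/master/data/images/zidane.jpg'])

results.print()                     # per-image summary string built in display()
results.save(save_dir='results/')   # boxes drawn via plot_one_box(), saved as results/zidane.jpg
print(results.files)                # ['zidane.jpg'], derived from the input URI
print(results.xyxy[0])              # (x1, y1, x2, y2, conf, cls) tensor for image 0
```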
diff --git a/models/export.py b/models/export.py
index 057658af53dc..cc817871f218 100644
--- a/models/export.py
+++ b/models/export.py
@@ -22,6 +22,7 @@
     parser = argparse.ArgumentParser()
     parser.add_argument('--weights', type=str, default='./yolov5s.pt', help='weights path')  # from yolov5/models/
     parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size')  # height, width
+    parser.add_argument('--dynamic', action='store_true', help='dynamic ONNX axes')
     parser.add_argument('--batch-size', type=int, default=1, help='batch size')
     opt = parser.parse_args()
     opt.img_size *= 2 if len(opt.img_size) == 1 else 1  # expand
@@ -70,7 +71,9 @@
         print('\nStarting ONNX export with onnx %s...' % onnx.__version__)
         f = opt.weights.replace('.pt', '.onnx')  # filename
         torch.onnx.export(model, img, f, verbose=False, opset_version=12, input_names=['images'],
-                          output_names=['classes', 'boxes'] if y is None else ['output'])
+                          output_names=['classes', 'boxes'] if y is None else ['output'],
+                          dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'},  # size(1,3,640,640)
+                                        'output': {0: 'batch', 2: 'y', 3: 'x'}} if opt.dynamic else None)

         # Checks
         onnx_model = onnx.load(f)  # load onnx model
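With the `--dynamic` flag wired into `torch.onnx.export` above, the exported graph keeps batch, height and width symbolic instead of baked to `(1, 3, 640, 640)`. A usage sketch, run from the repo root with the weights file assumed present:

```bash
# export yolov5s.pt to ONNX with dynamic batch/height/width axes
$ python models/export.py --weights yolov5s.pt --img-size 640 640 --batch-size 1 --dynamic
```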
diff --git a/models/yolo.py b/models/yolo.py
index 11e6a65921a4..85043f2b0205 100644
--- a/models/yolo.py
+++ b/models/yolo.py
@@ -2,7 +2,6 @@
 import logging
 import sys
 from copy import deepcopy
-from pathlib import Path

 sys.path.append('./')  # to run '$ python *.py' files in subdirectories
 logger = logging.getLogger(__name__)
@@ -50,7 +49,7 @@ def forward(self, x):
                     self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                 y = x[i].sigmoid()
-                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                 y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                 z.append(y.view(bs, -1, self.no))
@@ -110,9 +109,9 @@ def forward(self, x, augment=False, profile=False):
             # cv2.imwrite(f'img_{si}.jpg', 255 * xi[0].cpu().numpy().transpose((1, 2, 0))[:, :, ::-1])  # save
             yi[..., :4] /= si  # de-scale
             if fi == 2:
-                yi[..., 1] = img_size[0] - yi[..., 1]  # de-flip ud
+                yi[..., 1] = img_size[0] - 1 - yi[..., 1]  # de-flip ud
             elif fi == 3:
-                yi[..., 0] = img_size[1] - yi[..., 0]  # de-flip lr
+                yi[..., 0] = img_size[1] - 1 - yi[..., 0]  # de-flip lr
             y.append(yi)
         return torch.cat(y, 1), None  # augmented inference, train
     else:
@@ -213,43 +212,27 @@ def parse_model(d, ch):  # model_dict, input_channels(3)
         if m in [Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP, C3]:
             c1, c2 = ch[f], args[0]
-
-            # Normal
-            # if i > 0 and args[0] != no:  # channel expansion factor
-            #     ex = 1.75  # exponential (default 2.0)
-            #     e = math.log(c2 / ch[1]) / math.log(2)
-            #     c2 = int(ch[1] * ex ** e)
-            # if m != Focus:
-
-            c2 = make_divisible(c2 * gw, 8) if c2 != no else c2
-
-            # Experimental
-            # if i > 0 and args[0] != no:  # channel expansion factor
-            #     ex = 1 + gw  # exponential (default 2.0)
-            #     ch1 = 32  # ch[1]
-            #     e = math.log(c2 / ch1) / math.log(2)  # level 1-n
-            #     c2 = int(ch1 * ex ** e)
-            # if m != Focus:
-            #     c2 = make_divisible(c2, 8) if c2 != no else c2
+            if c2 != no:  # if not output
+                c2 = make_divisible(c2 * gw, 8)

             args = [c1, c2, *args[1:]]
             if m in [BottleneckCSP, C3]:
-                args.insert(2, n)
+                args.insert(2, n)  # number of repeats
                 n = 1
         elif m is nn.BatchNorm2d:
             args = [ch[f]]
         elif m is Concat:
-            c2 = sum([ch[x if x < 0 else x + 1] for x in f])
+            c2 = sum([ch[x] for x in f])
         elif m is Detect:
-            args.append([ch[x + 1] for x in f])
+            args.append([ch[x] for x in f])
             if isinstance(args[1], int):  # number of anchors
                 args[1] = [list(range(args[1] * 2))] * len(f)
         elif m is Contract:
-            c2 = ch[f if f < 0 else f + 1] * args[0] ** 2
+            c2 = ch[f] * args[0] ** 2
         elif m is Expand:
-            c2 = ch[f if f < 0 else f + 1] // args[0] ** 2
+            c2 = ch[f] // args[0] ** 2
         else:
-            c2 = ch[f if f < 0 else f + 1]
+            c2 = ch[f]

         m_ = nn.Sequential(*[m(*args) for _ in range(n)]) if n > 1 else m(*args)  # module
         t = str(m)[8:-2].replace('__main__.', '')  # module type
@@ -258,6 +241,8 @@ def parse_model(d, ch):  # model_dict, input_channels(3)
         logger.info('%3s%18s%3s%10.0f %-40s%-30s' % (i, f, n, np, t, args))  # print
         save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
         layers.append(m_)
+        if i == 0:
+            ch = []
         ch.append(c2)
     return nn.Sequential(*layers), sorted(save)
diff --git a/requirements.txt b/requirements.txt
index d22b42f5d786..cb50cf8f32e1 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,8 +21,8 @@ seaborn>=0.11.0
 pandas

 # export --------------------------------------
-# coremltools==4.0
-# onnx>=1.8.0
+# coremltools>=4.1
+# onnx>=1.8.1
 # scikit-learn==0.19.2  # for coreml quantization

 # extras --------------------------------------
diff --git a/test.py b/test.py
index 738764f15601..91176eca01db 100644
--- a/test.py
+++ b/test.py
@@ -52,7 +52,8 @@ def test(data,
     # Load model
     model = attempt_load(weights, map_location=device)  # load FP32 model
-    imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
+    gs = max(int(model.stride.max()), 32)  # grid size (max stride)
+    imgsz = check_img_size(imgsz, s=gs)  # check img_size

     # Multi-GPU disabled, incompatible with .half() https://github.com/ultralytics/yolov5/issues/99
     # if device.type != 'cpu' and torch.cuda.device_count() > 1:
@@ -85,7 +86,7 @@ def test(data,
         if device.type != 'cpu':
             model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once
         path = data['test'] if opt.task == 'test' else data['val']  # path to val/test images
-        dataloader = create_dataloader(path, imgsz, batch_size, model.stride.max(), opt, pad=0.5, rect=True,
+        dataloader = create_dataloader(path, imgsz, batch_size, gs, opt, pad=0.5, rect=True,
                                        prefix=colorstr('test: ' if opt.task == 'test' else 'val: '))[0]

     seen = 0
@@ -106,7 +107,7 @@ def test(data,
         with torch.no_grad():
             # Run model
             t = time_synchronized()
-            inf_out, train_out = model(img, augment=augment)  # inference and training outputs
+            out, train_out = model(img, augment=augment)  # inference and training outputs
             t0 += time_synchronized() - t

             # Compute loss
@@ -117,11 +118,11 @@ def test(data,
             targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device)  # to pixels
             lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else []  # for autolabelling
             t = time_synchronized()
-            output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres, labels=lb)
+            out = non_max_suppression(out, conf_thres=conf_thres, iou_thres=iou_thres, labels=lb, multi_label=True)
             t1 += time_synchronized() - t

         # Statistics per image
-        for si, pred in enumerate(output):
+        for si, pred in enumerate(out):
             labels = targets[targets[:, 0] == si, 1:]
             nl = len(labels)
             tcls = labels[:, 0].tolist() if nl else []  # target class
@@ -209,7 +210,7 @@ def test(data,
             f = save_dir / f'test_batch{batch_i}_labels.jpg'  # labels
             Thread(target=plot_images, args=(img, targets, paths, f, names), daemon=True).start()
             f = save_dir / f'test_batch{batch_i}_pred.jpg'  # predictions
-            Thread(target=plot_images, args=(img, output_to_target(output), paths, f, names), daemon=True).start()
+            Thread(target=plot_images, args=(img, output_to_target(out), paths, f, names), daemon=True).start()

     # Compute statistics
     stats = [np.concatenate(x, 0) for x in zip(*stats)]  # to numpy
@@ -268,10 +269,10 @@ def test(data,
             print(f'pycocotools unable to run: {e}')

     # Return results
+    model.float()  # for training
     if not training:
         s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
         print(f"Results saved to {save_dir}{s}")
-        model.float()  # for training
     maps = np.zeros(nc) + map
     for i, c in
enumerate(ap_class): maps[c] = ap[i] diff --git a/train.py b/train.py index 4ec97ae71e16..bbf879f3af5f 100644 --- a/train.py +++ b/train.py @@ -4,6 +4,7 @@ import os import random import time +from copy import deepcopy from pathlib import Path from threading import Thread @@ -31,7 +32,7 @@ from utils.google_utils import attempt_download from utils.loss import ComputeLoss from utils.plots import plot_images, plot_labels, plot_results, plot_evolution -from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first +from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first, is_parallel logger = logging.getLogger(__name__) @@ -120,7 +121,10 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR - lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] + if opt.linear_lr: + lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear + else: + lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) @@ -130,9 +134,13 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): wandb_run = wandb.init(config=opt, resume="allow", project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, name=save_dir.stem, + entity=opt.entity, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) loggers = {'wandb': wandb} # loggers dict + # EMA + ema = ModelEMA(model) if rank in [-1, 0] else None + # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: @@ -141,10 +149,14 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] + # EMA + if ema and ckpt.get('ema'): + ema.ema.load_state_dict(ckpt['ema'][0].float().state_dict()) + ema.updates = ckpt['ema'][1] + # Results if ckpt.get('training_results') is not None: - with open(results_file, 'w') as file: - file.write(ckpt['training_results']) # write results.txt + results_file.write_text(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 @@ -158,7 +170,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): del ckpt, state_dict # Image sizes - gs = int(model.stride.max()) # grid size (max stride) + gs = max(int(model.stride.max()), 32) # grid size (max stride) nl = model.model[-1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples @@ -171,9 +183,6 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') - # EMA - ema = ModelEMA(model) if rank in [-1, 0] else None - # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) @@ -189,8 +198,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): # Process 0 if rank in [-1, 0]: - ema.updates = start_epoch * nb // accumulate # set EMA updates - testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, # testloader + testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, 
pad=0.5, prefix=colorstr('val: '))[0]
@@ -333,12 +341,11 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
         # DDP process 0 or single-GPU
         if rank in [-1, 0]:
             # mAP
-            if ema:
-                ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
+            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
             final_epoch = epoch + 1 == epochs
             if not opt.notest or final_epoch:  # Calculate mAP
                 results, maps, times = test.test(opt.data,
-                                                 batch_size=total_batch_size,
+                                                 batch_size=batch_size * 2,
                                                  imgsz=imgsz_test,
                                                  model=ema.ema,
                                                  single_cls=opt.single_cls,
@@ -351,7 +358,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
             # Write
             with open(results_file, 'a') as f:
-                f.write(s + '%10.4g' * 7 % results + '\n')  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
+                f.write(s + '%10.4g' * 7 % results + '\n')  # append metrics, val_loss
             if len(opt.name) and opt.bucket:
                 os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name))
@@ -372,30 +379,30 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
                 best_fitness = fi

             # Save model
-            save = (not opt.nosave) or (final_epoch and not opt.evolve)
-            if save:
-                with open(results_file, 'r') as f:  # create checkpoint
-                    ckpt = {'epoch': epoch,
-                            'best_fitness': best_fitness,
-                            'training_results': f.read(),
-                            'model': ema.ema,
-                            'optimizer': None if final_epoch else optimizer.state_dict(),
-                            'wandb_id': wandb_run.id if wandb else None}
+            if (not opt.nosave) or (final_epoch and not opt.evolve):  # if save
+                ckpt = {'epoch': epoch,
+                        'best_fitness': best_fitness,
+                        'training_results': results_file.read_text(),
+                        'model': deepcopy(model.module if is_parallel(model) else model).half(),
+                        'ema': (deepcopy(ema.ema).half(), ema.updates),
+                        'optimizer': optimizer.state_dict(),
+                        'wandb_id': wandb_run.id if wandb else None}

                 # Save last, best and delete
                 torch.save(ckpt, last)
                 if best_fitness == fi:
                     torch.save(ckpt, best)
                 del ckpt
+        # end epoch ----------------------------------------------------------------------------------------------------

     # end training
     if rank in [-1, 0]:
         # Strip optimizers
         final = best if best.exists() else last  # final model
-        for f in [last, best]:
+        for f in last, best:
             if f.exists():
-                strip_optimizer(f)  # strip optimizers
+                strip_optimizer(f)
         if opt.bucket:
             os.system(f'gsutil cp {final} gs://{opt.bucket}/weights')  # upload
@@ -412,17 +419,17 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
         # Test best.pt
         logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
         if opt.data.endswith('coco.yaml') and nc == 80:  # if COCO
-            for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]):  # speed, mAP tests
+            for m in (last, best) if best.exists() else (last,):  # speed, mAP tests
                 results, _, _ = test.test(opt.data,
-                                          batch_size=total_batch_size,
+                                          batch_size=batch_size * 2,
                                           imgsz=imgsz_test,
-                                          conf_thres=conf,
-                                          iou_thres=iou,
-                                          model=attempt_load(final, device).half(),
+                                          conf_thres=0.001,
+                                          iou_thres=0.7,
+                                          model=attempt_load(m, device).half(),
                                           single_cls=opt.single_cls,
                                           dataloader=testloader,
                                           save_dir=save_dir,
-                                          save_json=save_json,
+                                          save_json=True,
                                           plots=False)

     else:
@@ -461,9 +468,11 @@ def train(hyp, opt, device, tb_writer=None, wandb=None):
     parser.add_argument('--log-artifacts', action='store_true', help='log artifacts, i.e.
final trained model') parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers') parser.add_argument('--project', default='runs/train', help='save to project/name') + parser.add_argument('--entity', default=None, help='W&B entity') parser.add_argument('--name', default='exp', help='save to project/name') parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') parser.add_argument('--quad', action='store_true', help='quad dataloader') + parser.add_argument('--linear-lr', action='store_true', help='linear LR') opt = parser.parse_args() # Set DDP variables diff --git a/tutorial.ipynb b/tutorial.ipynb index 3f7133f4f7d7..7fce40c3824e 100644 --- a/tutorial.ipynb +++ b/tutorial.ipynb @@ -16,7 +16,7 @@ "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { - "811fd52fef65422c8267bafcde8a2c3d": { + "1f8e9b8ebded4175b2eaa9f75c3ceb00": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { @@ -28,15 +28,15 @@ "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", - "layout": "IPY_MODEL_8f41b90117224eef9133a9c3a103dbba", + "layout": "IPY_MODEL_0a1246a73077468ab80e979cc0576cd2", "_model_module": "@jupyter-widgets/controls", "children": [ - "IPY_MODEL_ca2fb37af6ed43d4a74cdc9f2ac5c4a5", - "IPY_MODEL_29419ae5ebb9403ea73f7e5a68037bdd" + "IPY_MODEL_d327cde5a85a4a51bb8b1b3e9cf06c97", + "IPY_MODEL_d5ef1cb2cbed4b87b3c5d292ff2b0da6" ] } }, - "8f41b90117224eef9133a9c3a103dbba": { + "0a1246a73077468ab80e979cc0576cd2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { @@ -87,12 +87,12 @@ "left": null } }, - "ca2fb37af6ed43d4a74cdc9f2ac5c4a5": { + "d327cde5a85a4a51bb8b1b3e9cf06c97": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", - "style": "IPY_MODEL_6511b4dfb10b48d1bc98bcfb3987bfa0", + "style": "IPY_MODEL_8d5dff8bca14435a88fa1814533acd85", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", @@ -107,30 +107,30 @@ "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_64f0badf1a8f489885aa984dd62d37dc" + "layout": "IPY_MODEL_3d5136c19e7645ca9bc8f51ceffb2be1" } }, - "29419ae5ebb9403ea73f7e5a68037bdd": { + "d5ef1cb2cbed4b87b3c5d292ff2b0da6": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", - "style": "IPY_MODEL_f569911c5cfc4d81bb1bdfa83447afc8", + "style": "IPY_MODEL_2919396dbd4b4c8e821d12bd28665d8a", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "value": " 781M/781M [00:23<00:00, 34.2MB/s]", + "value": " 781M/781M [00:12<00:00, 65.5MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_84943ade566440aaa2dcf3b3b27e7074" + "layout": "IPY_MODEL_6feb16f2b2fa4021b1a271e1dd442d04" } }, - "6511b4dfb10b48d1bc98bcfb3987bfa0": { + "8d5dff8bca14435a88fa1814533acd85": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { @@ -145,7 +145,7 @@ "_model_module": "@jupyter-widgets/controls" } }, - "64f0badf1a8f489885aa984dd62d37dc": { + "3d5136c19e7645ca9bc8f51ceffb2be1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { @@ -196,7 
+196,7 @@ "left": null } }, - "f569911c5cfc4d81bb1bdfa83447afc8": { + "2919396dbd4b4c8e821d12bd28665d8a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { @@ -210,7 +210,7 @@ "_model_module": "@jupyter-widgets/controls" } }, - "84943ade566440aaa2dcf3b3b27e7074": { + "6feb16f2b2fa4021b1a271e1dd442d04": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { @@ -261,7 +261,7 @@ "left": null } }, - "8501ed1563e4452eac9df6b7a66e8f8c": { + "e6459e0bcee449b090fc9807672725bc": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { @@ -273,15 +273,15 @@ "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", - "layout": "IPY_MODEL_d2bb96801e1f46f4a58e02534f7026ff", + "layout": "IPY_MODEL_c341e1d3bf3b40d1821ce392eb966c68", "_model_module": "@jupyter-widgets/controls", "children": [ - "IPY_MODEL_468a796ef06b4a24bcba6fbd4a0a8db5", - "IPY_MODEL_42ad5c1ea7be4835bffebf90642178f1" + "IPY_MODEL_660afee173694231a6dce3cd94df6cae", + "IPY_MODEL_261218485cef48df961519dde5edfcbe" ] } }, - "d2bb96801e1f46f4a58e02534f7026ff": { + "c341e1d3bf3b40d1821ce392eb966c68": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { @@ -332,12 +332,12 @@ "left": null } }, - "468a796ef06b4a24bcba6fbd4a0a8db5": { + "660afee173694231a6dce3cd94df6cae": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", - "style": "IPY_MODEL_c58b5536d98f4814831934e9c30c4d78", + "style": "IPY_MODEL_32736d503c06497abfae8c0421918255", "_dom_classes": [], "description": "100%", "_model_name": "FloatProgressModel", @@ -352,30 +352,30 @@ "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_505597101151486ea29e9ab754544d27" + "layout": "IPY_MODEL_e257738711f54d5280c8393d9d3dce1c" } }, - "42ad5c1ea7be4835bffebf90642178f1": { + "261218485cef48df961519dde5edfcbe": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", - "style": "IPY_MODEL_de6e7b4b4a1c408c9f89d89b07a13bcd", + "style": "IPY_MODEL_beb7a6fe34b840899bb79c062681696f", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", - "value": " 21.1M/21.1M [00:01<00:00, 18.2MB/s]", + "value": " 21.1M/21.1M [00:00<00:00, 33.5MB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_f5cc9c7d4c274b2d81327ba3163c43fd" + "layout": "IPY_MODEL_e639132395d64d70b99d8b72c32f8fbb" } }, - "c58b5536d98f4814831934e9c30c4d78": { + "32736d503c06497abfae8c0421918255": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { @@ -390,7 +390,7 @@ "_model_module": "@jupyter-widgets/controls" } }, - "505597101151486ea29e9ab754544d27": { + "e257738711f54d5280c8393d9d3dce1c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { @@ -441,7 +441,7 @@ "left": null } }, - "de6e7b4b4a1c408c9f89d89b07a13bcd": { + "beb7a6fe34b840899bb79c062681696f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { @@ -455,7 +455,7 @@ "_model_module": "@jupyter-widgets/controls" } }, - "f5cc9c7d4c274b2d81327ba3163c43fd": { + "e639132395d64d70b99d8b72c32f8fbb": { "model_module": "@jupyter-widgets/base", 
"model_name": "LayoutModel", "state": { @@ -550,7 +550,7 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "c6ad57c2-40b7-4764-b07d-19ee2ceaabaf" + "outputId": "ae8805a9-ce15-4e1c-f6b4-baa1c1033f56" }, "source": [ "!git clone https://github.com/ultralytics/yolov5 # clone repo\n", @@ -568,7 +568,7 @@ { "output_type": "stream", "text": [ - "Setup complete. Using torch 1.7.0+cu101 _CudaDeviceProperties(name='Tesla V100-SXM2-16GB', major=7, minor=0, total_memory=16130MB, multi_processor_count=80)\n" + "Setup complete. Using torch 1.7.0+cu101 _CudaDeviceProperties(name='Tesla V100-SXM2-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)\n" ], "name": "stdout" } @@ -672,17 +672,17 @@ "base_uri": "https://localhost:8080/", "height": 65, "referenced_widgets": [ - "811fd52fef65422c8267bafcde8a2c3d", - "8f41b90117224eef9133a9c3a103dbba", - "ca2fb37af6ed43d4a74cdc9f2ac5c4a5", - "29419ae5ebb9403ea73f7e5a68037bdd", - "6511b4dfb10b48d1bc98bcfb3987bfa0", - "64f0badf1a8f489885aa984dd62d37dc", - "f569911c5cfc4d81bb1bdfa83447afc8", - "84943ade566440aaa2dcf3b3b27e7074" + "1f8e9b8ebded4175b2eaa9f75c3ceb00", + "0a1246a73077468ab80e979cc0576cd2", + "d327cde5a85a4a51bb8b1b3e9cf06c97", + "d5ef1cb2cbed4b87b3c5d292ff2b0da6", + "8d5dff8bca14435a88fa1814533acd85", + "3d5136c19e7645ca9bc8f51ceffb2be1", + "2919396dbd4b4c8e821d12bd28665d8a", + "6feb16f2b2fa4021b1a271e1dd442d04" ] }, - "outputId": "59a7a546-8492-492e-861d-70a2c85a6794" + "outputId": "d6ace7c6-1be5-41ff-d607-1c716b88d298" }, "source": [ "# Download COCO val2017\n", @@ -695,7 +695,7 @@ "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "811fd52fef65422c8267bafcde8a2c3d", + "model_id": "1f8e9b8ebded4175b2eaa9f75c3ceb00", "version_minor": 0, "version_major": 2 }, @@ -723,7 +723,7 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "427c211e-e283-4e87-f7b3-7b8dfb11a4a5" + "outputId": "cc25f70c-0a11-44f6-cc44-e92c5083488c" }, "source": [ "# Run YOLOv5x on COCO val2017\n", @@ -735,34 +735,33 @@ "output_type": "stream", "text": [ "Namespace(augment=False, batch_size=32, conf_thres=0.001, data='./data/coco.yaml', device='', exist_ok=False, img_size=640, iou_thres=0.65, name='exp', project='runs/test', save_conf=False, save_hybrid=False, save_json=True, save_txt=False, single_cls=False, task='val', verbose=False, weights=['yolov5x.pt'])\n", - "YOLOv5 v4.0-21-gb26a2f6 torch 1.7.0+cu101 CUDA:0 (Tesla V100-SXM2-16GB, 16130.5MB)\n", + "YOLOv5 v4.0-75-gbdd88e1 torch 1.7.0+cu101 CUDA:0 (Tesla V100-SXM2-16GB, 16160.5MB)\n", "\n", "Downloading https://github.com/ultralytics/yolov5/releases/download/v4.0/yolov5x.pt to yolov5x.pt...\n", - "100% 168M/168M [00:05<00:00, 31.9MB/s]\n", + "100% 168M/168M [00:04<00:00, 39.7MB/s]\n", "\n", "Fusing layers... \n", "Model Summary: 476 layers, 87730285 parameters, 0 gradients, 218.8 GFLOPS\n", - "\u001b[34m\u001b[1mval: \u001b[0mScanning '../coco/labels/val2017' for images and labels... 4952 found, 48 missing, 0 empty, 0 corrupted: 100% 5000/5000 [00:01<00:00, 2791.81it/s]\n", - "\u001b[34m\u001b[1mval: \u001b[0mNew cache created: ../coco/labels/val2017.cache\n", - "\u001b[34m\u001b[1mval: \u001b[0mScanning '../coco/labels/val2017.cache' for images and labels... 
4952 found, 48 missing, 0 empty, 0 corrupted: 100% 5000/5000 [00:00<00:00, 13332180.55it/s]\n", - " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 157/157 [01:30<00:00, 1.73it/s]\n", - " all 5e+03 3.63e+04 0.419 0.765 0.68 0.486\n", - "Speed: 5.2/2.0/7.2 ms inference/NMS/total per 640x640 image at batch-size 32\n", + "\u001b[34m\u001b[1mval: \u001b[0mScanning '../coco/val2017' for images and labels... 4952 found, 48 missing, 0 empty, 0 corrupted: 100% 5000/5000 [00:01<00:00, 2824.78it/s]\n", + "\u001b[34m\u001b[1mval: \u001b[0mNew cache created: ../coco/val2017.cache\n", + " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 157/157 [01:33<00:00, 1.68it/s]\n", + " all 5e+03 3.63e+04 0.749 0.619 0.68 0.486\n", + "Speed: 5.2/2.0/7.3 ms inference/NMS/total per 640x640 image at batch-size 32\n", "\n", "Evaluating pycocotools mAP... saving runs/test/exp/yolov5x_predictions.json...\n", "loading annotations into memory...\n", - "Done (t=0.41s)\n", + "Done (t=0.44s)\n", "creating index...\n", "index created!\n", "Loading and preparing results...\n", - "DONE (t=5.26s)\n", + "DONE (t=4.47s)\n", "creating index...\n", "index created!\n", "Running per image evaluation...\n", "Evaluate annotation type *bbox*\n", - "DONE (t=93.97s).\n", + "DONE (t=94.87s).\n", "Accumulating evaluation results...\n", - "DONE (t=15.06s).\n", + "DONE (t=15.96s).\n", " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.501\n", " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.687\n", " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.544\n", @@ -837,17 +836,17 @@ "base_uri": "https://localhost:8080/", "height": 65, "referenced_widgets": [ - "8501ed1563e4452eac9df6b7a66e8f8c", - "d2bb96801e1f46f4a58e02534f7026ff", - "468a796ef06b4a24bcba6fbd4a0a8db5", - "42ad5c1ea7be4835bffebf90642178f1", - "c58b5536d98f4814831934e9c30c4d78", - "505597101151486ea29e9ab754544d27", - "de6e7b4b4a1c408c9f89d89b07a13bcd", - "f5cc9c7d4c274b2d81327ba3163c43fd" + "e6459e0bcee449b090fc9807672725bc", + "c341e1d3bf3b40d1821ce392eb966c68", + "660afee173694231a6dce3cd94df6cae", + "261218485cef48df961519dde5edfcbe", + "32736d503c06497abfae8c0421918255", + "e257738711f54d5280c8393d9d3dce1c", + "beb7a6fe34b840899bb79c062681696f", + "e639132395d64d70b99d8b72c32f8fbb" ] }, - "outputId": "c68a3db4-1314-46b4-9e52-83532eb65749" + "outputId": "e8b7d5b3-a71e-4446-eec2-ad13419cf700" }, "source": [ "# Download COCO128\n", @@ -860,7 +859,7 @@ "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8501ed1563e4452eac9df6b7a66e8f8c", + "model_id": "e6459e0bcee449b090fc9807672725bc", "version_minor": 0, "version_major": 2 }, @@ -925,7 +924,7 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "outputId": "6af7116a-01ab-4b94-e5d7-b37c17dc95de" + "outputId": "38e51b29-2df4-4f00-cde8-5f6e4a34da9e" }, "source": [ "# Train YOLOv5s on COCO128 for 3 epochs\n", @@ -937,15 +936,15 @@ "output_type": "stream", "text": [ "\u001b[34m\u001b[1mgithub: \u001b[0mup to date with https://github.com/ultralytics/yolov5 ✅\n", - "YOLOv5 v4.0-21-gb26a2f6 torch 1.7.0+cu101 CUDA:0 (Tesla V100-SXM2-16GB, 16130.5MB)\n", + "YOLOv5 v4.0-75-gbdd88e1 torch 1.7.0+cu101 CUDA:0 (Tesla V100-SXM2-16GB, 16160.5MB)\n", "\n", - "Namespace(adam=False, batch_size=16, bucket='', cache_images=True, cfg='', data='./data/coco128.yaml', device='', epochs=3, evolve=False, exist_ok=False, global_rank=-1, hyp='data/hyp.scratch.yaml', image_weights=False, img_size=[640, 640], local_rank=-1, log_artifacts=False, 
log_imgs=16, multi_scale=False, name='exp', noautoanchor=False, nosave=True, notest=False, project='runs/train', quad=False, rect=False, resume=False, save_dir='runs/train/exp', single_cls=False, sync_bn=False, total_batch_size=16, weights='yolov5s.pt', workers=8, world_size=1)\n", + "Namespace(adam=False, batch_size=16, bucket='', cache_images=True, cfg='', data='./data/coco128.yaml', device='', epochs=3, evolve=False, exist_ok=False, global_rank=-1, hyp='data/hyp.scratch.yaml', image_weights=False, img_size=[640, 640], linear_lr=False, local_rank=-1, log_artifacts=False, log_imgs=16, multi_scale=False, name='exp', noautoanchor=False, nosave=True, notest=False, project='runs/train', quad=False, rect=False, resume=False, save_dir='runs/train/exp', single_cls=False, sync_bn=False, total_batch_size=16, weights='yolov5s.pt', workers=8, world_size=1)\n", "\u001b[34m\u001b[1mwandb: \u001b[0mInstall Weights & Biases for YOLOv5 logging with 'pip install wandb' (recommended)\n", "Start Tensorboard with \"tensorboard --logdir runs/train\", view at http://localhost:6006/\n", - "2021-01-17 19:56:03.945851: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1\n", + "2021-02-12 06:38:28.027271: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1\n", "\u001b[34m\u001b[1mhyperparameters: \u001b[0mlr0=0.01, lrf=0.2, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, box=0.05, cls=0.5, cls_pw=1.0, obj=1.0, obj_pw=1.0, iou_t=0.2, anchor_t=4.0, fl_gamma=0.0, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, degrees=0.0, translate=0.1, scale=0.5, shear=0.0, perspective=0.0, flipud=0.0, fliplr=0.5, mosaic=1.0, mixup=0.0\n", "Downloading https://github.com/ultralytics/yolov5/releases/download/v4.0/yolov5s.pt to yolov5s.pt...\n", - "100% 14.1M/14.1M [00:00<00:00, 15.8MB/s]\n", + "100% 14.1M/14.1M [00:01<00:00, 13.2MB/s]\n", "\n", "\n", " from n params module arguments \n", @@ -979,12 +978,11 @@ "Transferred 362/362 items from yolov5s.pt\n", "Scaled weight_decay = 0.0005\n", "Optimizer groups: 62 .bias, 62 conv.weight, 59 other\n", - "\u001b[34m\u001b[1mtrain: \u001b[0mScanning '../coco128/labels/train2017' for images and labels... 128 found, 0 missing, 2 empty, 0 corrupted: 100% 128/128 [00:00<00:00, 2647.74it/s]\n", + "\u001b[34m\u001b[1mtrain: \u001b[0mScanning '../coco128/labels/train2017' for images and labels... 128 found, 0 missing, 2 empty, 0 corrupted: 100% 128/128 [00:00<00:00, 2566.00it/s]\n", "\u001b[34m\u001b[1mtrain: \u001b[0mNew cache created: ../coco128/labels/train2017.cache\n", - "\u001b[34m\u001b[1mtrain: \u001b[0mScanning '../coco128/labels/train2017.cache' for images and labels... 128 found, 0 missing, 2 empty, 0 corrupted: 100% 128/128 [00:00<00:00, 1503840.09it/s]\n", - "\u001b[34m\u001b[1mtrain: \u001b[0mCaching images (0.1GB): 100% 128/128 [00:00<00:00, 176.03it/s]\n", - "\u001b[34m\u001b[1mval: \u001b[0mScanning '../coco128/labels/train2017.cache' for images and labels... 128 found, 0 missing, 2 empty, 0 corrupted: 100% 128/128 [00:00<00:00, 24200.82it/s]\n", - "\u001b[34m\u001b[1mval: \u001b[0mCaching images (0.1GB): 100% 128/128 [00:01<00:00, 123.25it/s]\n", + "\u001b[34m\u001b[1mtrain: \u001b[0mCaching images (0.1GB): 100% 128/128 [00:00<00:00, 175.07it/s]\n", + "\u001b[34m\u001b[1mval: \u001b[0mScanning '../coco128/labels/train2017.cache' for images and labels... 
128 found, 0 missing, 2 empty, 0 corrupted: 100% 128/128 [00:00<00:00, 764773.38it/s]\n", + "\u001b[34m\u001b[1mval: \u001b[0mCaching images (0.1GB): 100% 128/128 [00:00<00:00, 128.17it/s]\n", "Plotting labels... \n", "\n", "\u001b[34m\u001b[1mautoanchor: \u001b[0mAnalyzing anchors... anchors/target = 4.26, Best Possible Recall (BPR) = 0.9946\n", @@ -994,19 +992,19 @@ "Starting training for 3 epochs...\n", "\n", " Epoch gpu_mem box obj cls total targets img_size\n", - " 0/2 3.27G 0.04357 0.06779 0.01869 0.1301 207 640: 100% 8/8 [00:04<00:00, 1.95it/s]\n", - " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 8/8 [00:05<00:00, 1.36it/s]\n", - " all 128 929 0.392 0.732 0.657 0.428\n", + " 0/2 3.27G 0.04357 0.06781 0.01869 0.1301 207 640: 100% 8/8 [00:03<00:00, 2.03it/s]\n", + " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 4/4 [00:04<00:00, 1.14s/it]\n", + " all 128 929 0.646 0.627 0.659 0.431\n", "\n", " Epoch gpu_mem box obj cls total targets img_size\n", - " 1/2 7.47G 0.04308 0.06636 0.02083 0.1303 227 640: 100% 8/8 [00:02<00:00, 3.88it/s]\n", - " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 8/8 [00:01<00:00, 5.07it/s]\n", - " all 128 929 0.387 0.737 0.657 0.432\n", + " 1/2 7.75G 0.04308 0.06654 0.02083 0.1304 227 640: 100% 8/8 [00:01<00:00, 4.11it/s]\n", + " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 4/4 [00:01<00:00, 2.94it/s]\n", + " all 128 929 0.681 0.607 0.663 0.434\n", "\n", " Epoch gpu_mem box obj cls total targets img_size\n", - " 2/2 7.48G 0.04461 0.06864 0.01866 0.1319 191 640: 100% 8/8 [00:02<00:00, 3.57it/s]\n", - " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 8/8 [00:02<00:00, 2.82it/s]\n", - " all 128 929 0.385 0.742 0.658 0.431\n", + " 2/2 7.75G 0.04461 0.06896 0.01866 0.1322 191 640: 100% 8/8 [00:02<00:00, 3.94it/s]\n", + " Class Images Targets P R mAP@.5 mAP@.5:.95: 100% 4/4 [00:03<00:00, 1.22it/s]\n", + " all 128 929 0.642 0.632 0.662 0.432\n", "Optimizer stripped from runs/train/exp/weights/last.pt, 14.8MB\n", "3 epochs completed in 0.007 hours.\n", "\n" @@ -1224,6 +1222,19 @@ "execution_count": null, "outputs": [] }, + { + "cell_type": "code", + "metadata": { + "id": "RVRSOhEvUdb5" + }, + "source": [ + "# Evolve\n", + "!python train.py --img 640 --batch 64 --epochs 100 --data coco128.yaml --weights yolov5s.pt --cache --noautoanchor --evolve\n", + "!d=runs/train/evolve && cp evolve.* $d && zip -r evolve.zip $d && gsutil mv evolve.zip gs://bucket # upload results (optional)" + ], + "execution_count": null, + "outputs": [] + }, { "cell_type": "code", "metadata": { @@ -1238,4 +1249,4 @@ "outputs": [] } ] -} +} \ No newline at end of file diff --git a/utils/aws/__init__.py b/utils/aws/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/utils/aws/mime.sh b/utils/aws/mime.sh new file mode 100644 index 000000000000..c319a83cfbdf --- /dev/null +++ b/utils/aws/mime.sh @@ -0,0 +1,26 @@ +# AWS EC2 instance startup 'MIME' script https://aws.amazon.com/premiumsupport/knowledge-center/execute-user-data-ec2/ +# This script will run on every instance restart, not only on first start +# --- DO NOT COPY ABOVE COMMENTS WHEN PASTING INTO USERDATA --- + +Content-Type: multipart/mixed; boundary="//" +MIME-Version: 1.0 + +--// +Content-Type: text/cloud-config; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +Content-Disposition: attachment; filename="cloud-config.txt" + +#cloud-config +cloud_final_modules: +- [scripts-user, always] + +--// +Content-Type: text/x-shellscript; charset="us-ascii" +MIME-Version: 1.0 
+Content-Transfer-Encoding: 7bit
+Content-Disposition: attachment; filename="userdata.txt"
+
+#!/bin/bash
+# --- paste contents of userdata.sh here ---
+--//
diff --git a/utils/aws/resume.py b/utils/aws/resume.py
new file mode 100644
index 000000000000..563f22be20dc
--- /dev/null
+++ b/utils/aws/resume.py
@@ -0,0 +1,37 @@
+# Resume all interrupted trainings in yolov5/ dir including DDP trainings
+# Usage: $ python utils/aws/resume.py
+
+import os
+import sys
+from pathlib import Path
+
+import torch
+import yaml
+
+sys.path.append('./')  # to run '$ python *.py' files in subdirectories
+
+port = 0  # --master_port
+path = Path('').resolve()
+for last in path.rglob('*/**/last.pt'):
+    ckpt = torch.load(last)
+    if ckpt['optimizer'] is None:
+        continue
+
+    # Load opt.yaml
+    with open(last.parent.parent / 'opt.yaml') as f:
+        opt = yaml.load(f, Loader=yaml.SafeLoader)
+
+    # Get device count
+    d = opt['device'].split(',')  # devices
+    nd = len(d)  # number of devices
+    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1)  # distributed data parallel
+
+    if ddp:  # multi-GPU
+        port += 1
+        cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
+    else:  # single-GPU
+        cmd = f'python train.py --resume {last}'
+
+    cmd += ' > /dev/null 2>&1 &'  # redirect output to /dev/null and run in background
+    print(cmd)
+    os.system(cmd)
diff --git a/utils/aws/userdata.sh b/utils/aws/userdata.sh
new file mode 100644
index 000000000000..36405d1a1565
--- /dev/null
+++ b/utils/aws/userdata.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html
+# This script will run only once on first instance start (for a re-start script see mime.sh)
+# /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir
+# Use >300 GB SSD
+
+cd home/ubuntu
+if [ ! -d yolov5 ]; then
+  echo "Running first-time script."  # install dependencies, download COCO, pull Docker
+  git clone https://github.com/ultralytics/yolov5 && sudo chmod -R 777 yolov5
+  cd yolov5
+  bash data/scripts/get_coco.sh && echo "Data done." &
+  sudo docker pull ultralytics/yolov5:latest && echo "Docker done." &
+  # python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." &
+else
+  echo "Running re-start script."  # resume interrupted runs
+  i=0
+  list=$(docker ps -qa)  # container list i.e.
$'one\ntwo\nthree\nfour' + while IFS= read -r id; do + ((i++)) + echo "restarting container $i: $id" + docker start $id + # docker exec -it $id python train.py --resume # single-GPU + docker exec -d $id python utils/aws/resume.py + done <<<"$list" +fi diff --git a/utils/datasets.py b/utils/datasets.py index 1e23934b63cc..d6ab16518034 100755 --- a/utils/datasets.py +++ b/utils/datasets.py @@ -20,12 +20,13 @@ from torch.utils.data import Dataset from tqdm import tqdm -from utils.general import xyxy2xywh, xywh2xyxy, xywhn2xyxy, clean_str +from utils.general import xyxy2xywh, xywh2xyxy, xywhn2xyxy, xyn2xy, segment2box, segments2boxes, resample_segments, \ + clean_str from utils.torch_utils import torch_distributed_zero_first # Parameters help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data' -img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng'] # acceptable image suffixes +img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng', 'webp'] # acceptable image suffixes vid_formats = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes logger = logging.getLogger(__name__) @@ -120,8 +121,7 @@ def __iter__(self): class LoadImages: # for inference def __init__(self, path, img_size=640, stride=32): - p = str(Path(path)) # os-agnostic - p = os.path.abspath(p) # absolute path + p = str(Path(path).absolute()) # os-agnostic absolute path if '*' in p: files = sorted(glob.glob(p, recursive=True)) # glob elif os.path.isdir(p): @@ -300,7 +300,8 @@ def update(self, index, cap): # _, self.imgs[index] = cap.read() cap.grab() if n == 4: # read every 4th frame - _, self.imgs[index] = cap.retrieve() + success, im = cap.retrieve() + self.imgs[index] = im if success else self.imgs[index] * 0 n = 0 time.sleep(0.01) # wait time @@ -334,7 +335,7 @@ def __len__(self): def img2label_paths(img_paths): # Define label paths as a function of image paths sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep # /images/, /labels/ substrings - return [x.replace(sa, sb, 1).replace('.' 
+ x.split('.')[-1], '.txt') for x in img_paths] + return ['txt'.join(x.replace(sa, sb, 1).rsplit(x.split('.')[-1], 1)) for x in img_paths] class LoadImagesAndLabels(Dataset): # for training/testing @@ -349,44 +350,49 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r self.mosaic_border = [-img_size // 2, -img_size // 2] self.stride = stride self.path = path - + try: f = [] # image files for p in path if isinstance(path, list) else [path]: p = Path(p) # os-agnostic if p.is_dir(): # dir f += glob.glob(str(p / '**' / '*.*'), recursive=True) + # f = list(p.rglob('**/*.*')) # pathlib elif p.is_file(): # file with open(p, 'r') as t: t = t.read().strip().splitlines() parent = str(p.parent) + os.sep f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + # f += [p.parent / x.lstrip(os.sep) for x in t] # local to global path (pathlib) else: raise Exception(f'{prefix}{p} does not exist') self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats]) + # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in img_formats]) # pathlib assert self.img_files, f'{prefix}No images found' except Exception as e: raise Exception(f'{prefix}Error loading data from {path}: {e}\nSee {help_url}') # Check cache self.label_files = img2label_paths(self.img_files) # labels - cache_path = Path(self.label_files[0]).parent.with_suffix('.cache') # cached labels + cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache') # cached labels if cache_path.is_file(): - cache = torch.load(cache_path) # load - if cache['hash'] != get_hash(self.label_files + self.img_files) or 'results' not in cache: # changed - cache = self.cache_labels(cache_path, prefix) # re-cache + cache, exists = torch.load(cache_path), True # load + if cache['hash'] != get_hash(self.label_files + self.img_files) or 'version' not in cache: # changed + cache, exists = self.cache_labels(cache_path, prefix), False # re-cache else: - cache = self.cache_labels(cache_path, prefix) # cache + cache, exists = self.cache_labels(cache_path, prefix), False # cache # Display cache - [nf, nm, ne, nc, n] = cache.pop('results') # found, missing, empty, corrupted, total - desc = f"Scanning '{cache_path}' for images and labels... {nf} found, {nm} missing, {ne} empty, {nc} corrupted" - tqdm(None, desc=prefix + desc, total=n, initial=n) + nf, nm, ne, nc, n = cache.pop('results') # found, missing, empty, corrupted, total + if exists: + d = f"Scanning '{cache_path}' for images and labels... {nf} found, {nm} missing, {ne} empty, {nc} corrupted" + tqdm(None, desc=prefix + d, total=n, initial=n) # display cache results assert nf > 0 or not augment, f'{prefix}No labels in {cache_path}. Can not train without labels. 
See {help_url}' # Read cache cache.pop('hash') # remove hash - labels, shapes = zip(*cache.values()) + cache.pop('version') # remove version + labels, shapes, self.segments = zip(*cache.values()) self.labels = list(labels) self.shapes = np.array(shapes, dtype=np.float64) self.img_files = list(cache.keys()) # update @@ -449,6 +455,7 @@ def cache_labels(self, path=Path('./labels.cache'), prefix=''): im = Image.open(im_file) im.verify() # PIL verify shape = exif_size(im) # image size + segments = [] # instance segments assert (shape[0] > 9) & (shape[1] > 9), f'image size {shape} <10 pixels' assert im.format.lower() in img_formats, f'invalid image format {im.format}' @@ -456,7 +463,12 @@ def cache_labels(self, path=Path('./labels.cache'), prefix=''): if os.path.isfile(lb_file): nf += 1 # label found with open(lb_file, 'r') as f: - l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + l = [x.split() for x in f.read().strip().splitlines()] + if any([len(x) > 8 for x in l]): # is segment + classes = np.array([x[0] for x in l], dtype=np.float32) + segments = [np.array(x[1:], dtype=np.float32).reshape(-1, 2) for x in l] # (cls, xy1...) + l = np.concatenate((classes.reshape(-1, 1), segments2boxes(segments)), 1) # (cls, xywh) + l = np.array(l, dtype=np.float32) if len(l): assert l.shape[1] == 5, 'labels require 5 columns each' assert (l >= 0).all(), 'negative labels' @@ -468,7 +480,7 @@ def cache_labels(self, path=Path('./labels.cache'), prefix=''): else: nm += 1 # label missing l = np.zeros((0, 5), dtype=np.float32) - x[im_file] = [l, shape] + x[im_file] = [l, shape, segments] except Exception as e: nc += 1 print(f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}') @@ -480,7 +492,8 @@ def cache_labels(self, path=Path('./labels.cache'), prefix=''): print(f'{prefix}WARNING: No labels found in {path}. 
@@ -650,7 +663,7 @@ def hist_equalize(img, clahe=True, bgr=False):
 
 def load_mosaic(self, index):
     # loads images in a 4-mosaic
-    labels4 = []
+    labels4, segments4 = [], []
     s = self.img_size
     yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border]  # mosaic center x, y
     indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(3)]  # 3 additional image indices
@@ -678,19 +691,21 @@ def load_mosaic(self, index):
         padh = y1a - y1b
 
         # Labels
-        labels = self.labels[index].copy()
+        labels, segments = self.labels[index].copy(), self.segments[index].copy()
         if labels.size:
             labels[:, 1:] = xywhn2xyxy(labels[:, 1:], w, h, padw, padh)  # normalized xywh to pixel xyxy format
+            segments = [xyn2xy(x, w, h, padw, padh) for x in segments]
         labels4.append(labels)
+        segments4.extend(segments)
 
     # Concat/clip labels
-    if len(labels4):
-        labels4 = np.concatenate(labels4, 0)
-        np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:])  # use with random_perspective
-        # img4, labels4 = replicate(img4, labels4)  # replicate
+    labels4 = np.concatenate(labels4, 0)
+    for x in (labels4[:, 1:], *segments4):
+        np.clip(x, 0, 2 * s, out=x)  # clip when using random_perspective()
+    # img4, labels4 = replicate(img4, labels4)  # replicate
 
     # Augment
-    img4, labels4 = random_perspective(img4, labels4,
+    img4, labels4 = random_perspective(img4, labels4, segments4,
                                        degrees=self.hyp['degrees'],
                                        translate=self.hyp['translate'],
                                        scale=self.hyp['scale'],
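`xywhn2xyxy()` (whose `padw`/`padh` defaults change to 0 later in this diff) and the new `xyn2xy()` both map normalized tile coordinates onto the 2s x 2s mosaic canvas: scale by the tile size, then shift by the tile's top-left offset. Worked numbers for one box (illustration only):

```python
# Illustration only: a normalized xywh box placed on the mosaic canvas.
import numpy as np

w = h = 640                          # source tile size
padw, padh = 320, 0                  # tile's top-left offset inside the mosaic
b = np.array([0.5, 0.5, 0.2, 0.2])   # normalized (x, y, w, h)
x1 = w * (b[0] - b[2] / 2) + padw    # 576.0
y1 = h * (b[1] - b[3] / 2) + padh    # 256.0
x2 = w * (b[0] + b[2] / 2) + padw    # 704.0
y2 = h * (b[1] + b[3] / 2) + padh    # 384.0
print(x1, y1, x2, y2)
```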
@@ -704,7 +719,7 @@ def load_mosaic9(self, index):
     # loads images in a 9-mosaic
-    labels9 = []
+    labels9, segments9 = [], []
     s = self.img_size
     indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(8)]  # 8 additional image indices
     for i, index in enumerate(indices):
@@ -737,30 +752,34 @@ def load_mosaic9(self, index):
         x1, y1, x2, y2 = [max(x, 0) for x in c]  # allocate coords
 
         # Labels
-        labels = self.labels[index].copy()
+        labels, segments = self.labels[index].copy(), self.segments[index].copy()
         if labels.size:
             labels[:, 1:] = xywhn2xyxy(labels[:, 1:], w, h, padx, pady)  # normalized xywh to pixel xyxy format
+            segments = [xyn2xy(x, w, h, padx, pady) for x in segments]
         labels9.append(labels)
+        segments9.extend(segments)
 
         # Image
         img9[y1:y2, x1:x2] = img[y1 - pady:, x1 - padx:]  # img9[ymin:ymax, xmin:xmax]
         hp, wp = h, w  # height, width previous
 
     # Offset
-    yc, xc = [int(random.uniform(0, s)) for x in self.mosaic_border]  # mosaic center x, y
+    yc, xc = [int(random.uniform(0, s)) for _ in self.mosaic_border]  # mosaic center x, y
     img9 = img9[yc:yc + 2 * s, xc:xc + 2 * s]
 
     # Concat/clip labels
-    if len(labels9):
-        labels9 = np.concatenate(labels9, 0)
-        labels9[:, [1, 3]] -= xc
-        labels9[:, [2, 4]] -= yc
+    labels9 = np.concatenate(labels9, 0)
+    labels9[:, [1, 3]] -= xc
+    labels9[:, [2, 4]] -= yc
+    c = np.array([xc, yc])  # centers
+    segments9 = [x - c for x in segments9]
 
-        np.clip(labels9[:, 1:], 0, 2 * s, out=labels9[:, 1:])  # use with random_perspective
-        # img9, labels9 = replicate(img9, labels9)  # replicate
+    for x in (labels9[:, 1:], *segments9):
+        np.clip(x, 0, 2 * s, out=x)  # clip when using random_perspective()
+    # img9, labels9 = replicate(img9, labels9)  # replicate
 
     # Augment
-    img9, labels9 = random_perspective(img9, labels9,
+    img9, labels9 = random_perspective(img9, labels9, segments9,
                                        degrees=self.hyp['degrees'],
                                        translate=self.hyp['translate'],
                                        scale=self.hyp['scale'],
@@ -821,7 +840,8 @@ def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scale
     return img, ratio, (dw, dh)
 
 
-def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)):
+def random_perspective(img, targets=(), segments=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0,
+                       border=(0, 0)):
     # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
     # targets = [cls, xyxy]
 
@@ -873,37 +893,38 @@ def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shea
     # Transform label coordinates
     n = len(targets)
     if n:
-        # warp points
-        xy = np.ones((n * 4, 3))
-        xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
-        xy = xy @ M.T  # transform
-        if perspective:
-            xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)  # rescale
-        else:  # affine
-            xy = xy[:, :2].reshape(n, 8)
-
-        # create new boxes
-        x = xy[:, [0, 2, 4, 6]]
-        y = xy[:, [1, 3, 5, 7]]
-        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
-
-        # # apply angle-based reduction of bounding boxes
-        # radians = a * math.pi / 180
-        # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
-        # x = (xy[:, 2] + xy[:, 0]) / 2
-        # y = (xy[:, 3] + xy[:, 1]) / 2
-        # w = (xy[:, 2] - xy[:, 0]) * reduction
-        # h = (xy[:, 3] - xy[:, 1]) * reduction
-        # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T
-
-        # clip boxes
-        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
-        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
+        use_segments = any(x.any() for x in segments)
+        new = np.zeros((n, 4))
+        if use_segments:  # warp segments
+            segments = resample_segments(segments)  # upsample
+            for i, segment in enumerate(segments):
+                xy = np.ones((len(segment), 3))
+                xy[:, :2] = segment
+                xy = xy @ M.T  # transform
+                xy = xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2]  # perspective rescale or affine
+
+                # clip
+                new[i] = segment2box(xy, width, height)
+
+        else:  # warp boxes
+            xy = np.ones((n * 4, 3))
+            xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+            xy = xy @ M.T  # transform
+            xy = (xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2]).reshape(n, 8)  # perspective rescale or affine
+
+            # create new boxes
+            x = xy[:, [0, 2, 4, 6]]
+            y = xy[:, [1, 3, 5, 7]]
+            new = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
+
+            # clip
+            new[:, [0, 2]] = new[:, [0, 2]].clip(0, width)
+            new[:, [1, 3]] = new[:, [1, 3]].clip(0, height)
 
         # filter candidates
-        i = box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T)
+        i = box_candidates(box1=targets[:, 1:5].T * s, box2=new.T, area_thr=0.01 if use_segments else 0.10)
         targets = targets[i]
-        targets[:, 1:5] = xy[i]
+        targets[:, 1:5] = new[i]
 
     return img, targets
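Warping the dense segment points and refitting the box, instead of warping the four box corners, yields tighter labels under rotation; that is also why `box_candidates()` can afford the looser `area_thr=0.01` for segments. A small demonstration of the difference (illustration only):

```python
# Illustration only: refit width after a 45-degree rotation, computed from a
# dense segment vs. from the 4 corners of its bounding box.
import numpy as np

theta = np.pi / 4
M = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta), np.cos(theta)]])

t = np.linspace(0, 2 * np.pi, 100)
segment = np.stack([np.cos(t), np.sin(t)], 1)  # unit circle, (100,2) points
corners = np.array([[-1., -1.], [1., -1.], [1., 1.], [-1., 1.]])  # its bbox corners

print(np.ptp((segment @ M.T)[:, 0]))  # ~2.0, box refit from the segment
print(np.ptp((corners @ M.T)[:, 0]))  # ~2.83, box refit from corners only
```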
diff --git a/utils/general.py b/utils/general.py
index bbc0f32b8425..e5bbc50c6177 100755
--- a/utils/general.py
+++ b/utils/general.py
@@ -47,11 +47,16 @@ def get_latest_run(search_dir='.'):
     return max(last_list, key=os.path.getctime) if last_list else ''
 
 
+def isdocker():
+    # Is environment a Docker container
+    return Path('/workspace').exists()  # or Path('/.dockerenv').exists()
+
+
 def check_online():
     # Check internet connectivity
     import socket
     try:
-        socket.create_connection(("1.1.1.1", 53))  # check host accesability
+        socket.create_connection(("1.1.1.1", 443), 5)  # check host accessibility
         return True
     except OSError:
         return False
@@ -62,7 +67,7 @@ def check_git_status():
     print(colorstr('github: '), end='')
     try:
         assert Path('.git').exists(), 'skipping check (not a git repository)'
-        assert not Path('/workspace').exists(), 'skipping check (Docker image)'  # not Path('/.dockerenv').exists()
+        assert not isdocker(), 'skipping check (Docker image)'
         assert check_online(), 'skipping check (offline)'
 
         cmd = 'git fetch && git config --get remote.origin.url'
@@ -95,6 +100,20 @@ def check_img_size(img_size, s=32):
     return new_size
 
 
+def check_imshow():
+    # Check if environment supports image displays
+    try:
+        assert not isdocker(), 'cv2.imshow() is disabled in Docker environments'
+        cv2.imshow('test', np.zeros((1, 1, 3)))
+        cv2.waitKey(1)
+        cv2.destroyAllWindows()
+        cv2.waitKey(1)
+        return True
+    except Exception as e:
+        print(f'WARNING: Environment does not support cv2.imshow() or PIL Image.show() image displays\n{e}')
+        return False
+
+
 def check_file(file):
     # Search for file if not found
     if os.path.isfile(file) or file == '':
@@ -225,7 +244,7 @@ def xywh2xyxy(x):
     return y
 
 
-def xywhn2xyxy(x, w=640, h=640, padw=32, padh=32):
+def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
     # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
     y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
     y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + padw  # top left x
@@ -235,6 +254,40 @@ def xywhn2xyxy(x, w=640, h=640, padw=32, padh=32):
     return y
 
 
+def xyn2xy(x, w=640, h=640, padw=0, padh=0):
+    # Convert normalized segments into pixel segments, shape (n,2)
+    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
+    y[:, 0] = w * x[:, 0] + padw  # top left x
+    y[:, 1] = h * x[:, 1] + padh  # top left y
+    return y
+
+
+def segment2box(segment, width=640, height=640):
+    # Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to (xyxy)
+    x, y = segment.T  # segment xy
+    inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height)
+    x, y, = x[inside], y[inside]
+    return np.array([x.min(), y.min(), x.max(), y.max()]) if any(x) else np.zeros((1, 4))  # cls, xyxy
+
+
+def segments2boxes(segments):
+    # Convert segment labels to box labels, i.e. (cls, xy1, xy2, ...) to (cls, xywh)
+    boxes = []
+    for s in segments:
+        x, y = s.T  # segment xy
+        boxes.append([x.min(), y.min(), x.max(), y.max()])  # cls, xyxy
+    return xyxy2xywh(np.array(boxes))  # cls, xywh
+
+
+def resample_segments(segments, n=1000):
+    # Up-sample an (n,2) segment
+    for i, s in enumerate(segments):
+        x = np.linspace(0, len(s) - 1, n)
+        xp = np.arange(len(s))
+        segments[i] = np.concatenate([np.interp(x, xp, s[:, i]) for i in range(2)]).reshape(2, -1).T  # segment xy
+    return segments
+
+
 def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
     # Rescale coords (xyxy) from img1_shape to img0_shape
     if ratio_pad is None:  # calculate from img0_shape
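A toy run of the helpers added above, assuming the repo root is on `PYTHONPATH` so they import from `utils.general`: a 3-point triangle is up-sampled to a dense segment by `resample_segments()`, then reduced to an in-image xyxy box by `segment2box()`.

```python
# Illustration only: exercise resample_segments() and segment2box().
import numpy as np
from utils.general import resample_segments, segment2box  # as added in this diff

tri = [np.array([[10., 10.], [100., 10.], [55., 80.]])]  # one (3,2) segment
dense = resample_segments(tri, n=100)[0]                 # (100,2) interpolated points
print(segment2box(dense, width=640, height=640))         # [ 10.  10. 100.  80.]
```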
@@ -337,11 +390,12 @@ def wh_iou(wh1, wh2):
     return inter / (wh1.prod(2) + wh2.prod(2) - inter)  # iou = inter / (area1 + area2 - inter)
 
 
-def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, labels=()):
-    """Performs Non-Maximum Suppression (NMS) on inference results
+def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
+                        labels=()):
+    """Runs Non-Maximum Suppression (NMS) on inference results
 
     Returns:
-         detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
+         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
     """
 
     nc = prediction.shape[2] - 5  # number of classes
@@ -353,7 +407,7 @@ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=Non
     max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
     time_limit = 10.0  # seconds to quit after
     redundant = True  # require redundant detections
-    multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)
+    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
     merge = False  # use merge-NMS
 
     t = time.time()
@@ -430,8 +484,8 @@ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=Non
 def strip_optimizer(f='weights/best.pt', s=''):  # from utils.general import *; strip_optimizer()
     # Strip optimizer from 'f' to finalize training, optionally save as 's'
     x = torch.load(f, map_location=torch.device('cpu'))
-    for key in 'optimizer', 'training_results', 'wandb_id':
-        x[key] = None
+    for k in 'optimizer', 'training_results', 'wandb_id', 'ema':  # keys
+        x[k] = None
     x['epoch'] = -1
     x['model'].half()  # to FP16
     for p in x['model'].parameters():
diff --git a/utils/loss.py b/utils/loss.py
index 889ddf7295da..2302d18de87d 100644
--- a/utils/loss.py
+++ b/utils/loss.py
@@ -105,9 +105,8 @@ def __init__(self, model, autobalance=False):
             BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)
 
         det = model.module.model[-1] if is_parallel(model) else model.model[-1]  # Detect() module
-        self.balance = {3: [3.67, 1.0, 0.43], 4: [3.78, 1.0, 0.39, 0.22], 5: [3.88, 1.0, 0.37, 0.17, 0.10]}[det.nl]
-        # self.balance = [1.0] * det.nl
-        self.ssi = (det.stride == 16).nonzero(as_tuple=False).item()  # stride 16 index
+        self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, .02])  # P3-P7
+        self.ssi = list(det.stride).index(16) if autobalance else 0  # stride 16 index
         self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, model.gr, h, autobalance
         for k in 'na', 'nc', 'nl', 'anchors':
             setattr(self, k, getattr(det, k))
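The loss-balance line above replaces a hard-keyed dict lookup with `dict.get()` and a P3-P7 fallback, so a model with other than 3 detection layers no longer raises `KeyError`. The lookup in isolation (illustration only):

```python
# Illustration only: the dict.get() fallback pattern used for self.balance.
default = [4.0, 1.0, 0.25, 0.06, .02]  # P3-P7 objectness gains
balance = {3: [4.0, 1.0, 0.4]}         # P3-P5 objectness gains
print(balance.get(3, default))         # [4.0, 1.0, 0.4]
print(balance.get(5, default))         # falls back to the 5-layer default
```

Note that a 4-layer model also receives the 5-element default here.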
diff --git a/utils/plots.py b/utils/plots.py
index 3ec793528fe5..aa9a1cab81f0 100644
--- a/utils/plots.py
+++ b/utils/plots.py
@@ -15,7 +15,7 @@
 import seaborn as sns
 import torch
 import yaml
-from PIL import Image, ImageDraw
+from PIL import Image, ImageDraw, ImageFont
 from scipy.signal import butter, filtfilt
 
 from utils.general import xywh2xyxy, xyxy2xywh
@@ -54,7 +54,7 @@ def butter_lowpass(cutoff, fs, order):
     return filtfilt(b, a, data)  # forward-backward filter
 
 
-def plot_one_box(x, img, color=None, label=None, line_thickness=None):
+def plot_one_box(x, img, color=None, label=None, line_thickness=3):
     # Plots one bounding box on image img
     tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
     color = color or [random.randint(0, 255) for _ in range(3)]
@@ -68,6 +68,20 @@ def plot_one_box(x, img, color=None, label=None, line_thickness=None):
         cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
 
 
+def plot_one_box_PIL(box, img, color=None, label=None, line_thickness=None):
+    img = Image.fromarray(img)
+    draw = ImageDraw.Draw(img)
+    line_thickness = line_thickness or max(int(min(img.size) / 200), 2)
+    draw.rectangle(box, width=line_thickness, outline=tuple(color))  # plot
+    if label:
+        fontsize = max(round(max(img.size) / 40), 12)
+        font = ImageFont.truetype("Arial.ttf", fontsize)
+        txt_width, txt_height = font.getsize(label)
+        draw.rectangle([box[0], box[1] - txt_height + 4, box[0] + txt_width, box[1]], fill=tuple(color))
+        draw.text((box[0], box[1] - txt_height + 1), label, fill=(255, 255, 255), font=font)
+    return np.asarray(img)
+
+
 def plot_wh_methods():  # from utils.plots import *; plot_wh_methods()
     # Compares the two methods for width-height anchor multiplication
     # https://github.com/ultralytics/yolov3/issues/168
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 2cb09e71ce71..1b1cc2038c55 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -205,7 +205,7 @@ def model_info(model, verbose=False, img_size=640):
     try:  # FLOPS
         from thop import profile
-        stride = int(model.stride.max()) if hasattr(model, 'stride') else 32
+        stride = max(int(model.stride.max()), 32) if hasattr(model, 'stride') else 32
         img = torch.zeros((1, model.yaml.get('ch', 3), stride, stride), device=next(model.parameters()).device)  # input
         flops = profile(deepcopy(model), inputs=(img,), verbose=False)[0] / 1E9 * 2  # stride GFLOPS
         img_size = img_size if isinstance(img_size, list) else [img_size, img_size]  # expand if int/float
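The `model_info()` change clamps the profiling input to at least 32x32, so the thop FLOPS pass cannot run on a degenerate image when `model.stride.max()` is small. A standalone sketch, assuming the `thop` package is installed; the stand-in module and the final scaling to 640 are illustrative, not taken from the patch:

```python
# Sketch only: FLOPS profiling at a clamped stride-sized input.
import torch
from copy import deepcopy
from thop import profile  # assumes thop is installed

model = torch.nn.Conv2d(3, 16, 3, padding=1)  # stand-in for a YOLOv5 model
stride = max(8, 32)                           # clamped to >= 32 as in the diff
img = torch.zeros((1, 3, stride, stride))     # minimal profiling input
flops = profile(deepcopy(model), inputs=(img,), verbose=False)[0] / 1E9 * 2  # GFLOPS at stride
print(f'{flops * (640 / stride) ** 2:.1f} GFLOPS at 640x640')  # scale to report size
```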