diff --git a/yolort/v5/helper.py b/yolort/v5/helper.py
index 40642f00..6b543555 100644
--- a/yolort/v5/helper.py
+++ b/yolort/v5/helper.py
@@ -4,6 +4,7 @@
 import torch
 
+from .models import AutoShape
 from .models.yolo import Model
 from .utils import attempt_download, intersect_dicts, set_logging
 
 
@@ -68,6 +69,6 @@ def load_yolov5_model(checkpoint_path: str, autoshape: bool = False, verbose: bo
     model.load_state_dict(ckpt_state_dict, strict=False)
 
     if autoshape:
-        model = model.autoshape()
+        model = AutoShape(model)
 
     return model
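Usage sketch (not part of the patch): with this change, `load_yolov5_model(..., autoshape=True)` wraps the loaded `Model` in `AutoShape` instead of calling the removed `model.autoshape()` method. The checkpoint and image paths below are placeholders, and the returned object is assumed to be the upstream-style `Detections` container:

    from yolort.v5.helper import load_yolov5_model

    model = load_yolov5_model("yolov5s.pt", autoshape=True)  # placeholder checkpoint path
    results = model("bus.jpg", size=640)  # AutoShape handles pre-processing and NMS
    results.print()  # assumes the upstream Detections API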
diff --git a/yolort/v5/models/common.py b/yolort/v5/models/common.py
index c6a7ce52..5021d14a 100644
--- a/yolort/v5/models/common.py
+++ b/yolort/v5/models/common.py
@@ -26,7 +26,7 @@
     xyxy2xywh,
 )
 from yolort.v5.utils.plots import Annotator, colors, save_one_box
-from yolort.v5.utils.torch_utils import time_sync
+from yolort.v5.utils.torch_utils import copy_attr, time_sync
 
 LOGGER = logging.getLogger(__name__)
 
@@ -419,32 +419,44 @@
 class AutoShape(nn.Module):
     conf = 0.25  # NMS confidence threshold
     iou = 0.45  # NMS IoU threshold
-    classes = None  # (optional list) filter by class
+    # (optional list) filter by class, i.e. = [0, 15, 16] for COCO persons, cats and dogs
+    classes = None
     multi_label = False  # NMS multiple labels per box
     max_det = 1000  # maximum number of detections per image
 
     def __init__(self, model):
         super().__init__()
+        LOGGER.info("Adding AutoShape... ")
+        # copy attributes
+        copy_attr(self, model, include=("yaml", "nc", "hyp", "names", "stride", "abc"), exclude=())
         self.model = model.eval()
 
-    def autoshape(self):
-        # model already converted to model.autoshape()
-        LOGGER.info("AutoShape already enabled, skipping... ")
+    def _apply(self, fn):
+        """
+        Apply to(), cpu(), cuda(), half() to model tensors that
+        are not parameters or registered buffers
+        """
+        self = super()._apply(fn)
+        m = self.model.model[-1]  # Detect()
+        m.stride = fn(m.stride)
+        m.grid = list(map(fn, m.grid))
+        if isinstance(m.anchor_grid, list):
+            m.anchor_grid = list(map(fn, m.anchor_grid))
         return self
 
     @torch.no_grad()
     def forward(self, imgs, size=640, augment=False, profile=False):
+        """
+        Inference from various sources. For height=640, width=1280, RGB images example inputs are:
+        - file: imgs = 'data/images/zidane.jpg'  # str or PosixPath
+        - URI: = 'https://ultralytics.com/images/zidane.jpg'
+        - OpenCV: = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(640,1280,3)
+        - PIL: = Image.open('image.jpg') or ImageGrab.grab()  # HWC x(640,1280,3)
+        - numpy: = np.zeros((640,1280,3))  # HWC
+        - torch: = torch.zeros(16,3,320,640)  # BCHW (scaled to size=640, 0-1 values)
+        - multiple: = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images
+        """
         from yolort.v5.utils.augmentations import letterbox
-
-        # Inference from various sources. For height=640, width=1280, RGB images example inputs are:
-        # file: imgs = 'data/images/zidane.jpg'  # str or PosixPath
-        # URI: = 'https://ultralytics.com/images/zidane.jpg'
-        # OpenCV: = cv2.imread('image.jpg')[:,:,::-1]  # HWC BGR to RGB x(640,1280,3)
-        # PIL: = Image.open('image.jpg') or ImageGrab.grab()  # HWC x(640,1280,3)
-        # numpy: = np.zeros((640,1280,3))  # HWC
-        # torch: = torch.zeros(16,3,320,640)  # BCHW (scaled to size=640, 0-1 values)
-        # multiple: = [Image.open('image1.jpg'), Image.open('image2.jpg'), ...]  # list of images
-
         from yolort.v5.utils.datasets import exif_transpose
 
         t = [time_sync()]
@@ -454,10 +466,10 @@ def forward(self, imgs, size=640, augment=False, profile=False):
                 return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference
 
         # Pre-process
-        n, imgs = (
-            (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])
-        )  # number of images, list of images
-        shape0, shape1, files = [], [], []  # image and inference shapes, filenames
+        # number of images, list of images
+        n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])
+        # image and inference shapes, filenames
+        shape0, shape1, files = [], [], []
         for i, im in enumerate(imgs):
             f = f"image{i}"  # filename
             if isinstance(im, (str, Path)):  # filename or uri
@@ -482,7 +494,7 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
         x = np.stack(x, 0) if n > 1 else x[0][None]  # stack
         x = np.ascontiguousarray(x.transpose((0, 3, 1, 2)))  # BHWC to BCHW
-        x = torch.from_numpy(x).to(p.device).type_as(p) / 255.0  # uint8 to fp16/32
+        x = torch.from_numpy(x).to(p.device).type_as(p) / 255  # uint8 to fp16/32
         t.append(time_sync())
 
         with amp.autocast(enabled=p.device.type != "cpu"):
@@ -498,7 +510,7 @@ def forward(self, imgs, size=640, augment=False, profile=False):
                 classes=self.classes,
                 multi_label=self.multi_label,
                 max_det=self.max_det,
-            )  # NMS
+            )
 
             for i in range(n):
                 scale_coords(shape1, y[i][:, :4], shape0[i])
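Usage sketch (not part of the patch): a quick illustration of the input formats documented by the new `forward` docstring. `model` is assumed to be an `AutoShape` instance such as the one built in the previous sketch, and the image file name is a placeholder:

    import numpy as np
    from PIL import Image

    array_img = np.zeros((640, 1280, 3), dtype=np.uint8)  # HWC, RGB
    pil_img = Image.open("zidane.jpg")                    # placeholder file
    # sources can be mixed in a single list; each entry is letterboxed to `size`
    results = model([array_img, pil_img, "zidane.jpg"], size=640)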
diff --git a/yolort/v5/utils/torch_utils.py b/yolort/v5/utils/torch_utils.py
index 54f4a8d0..ba34dad2 100644
--- a/yolort/v5/utils/torch_utils.py
+++ b/yolort/v5/utils/torch_utils.py
@@ -1,10 +1,9 @@
-# YOLOv5 by Ultralytics, GPL-3.0 license
+# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
 """
 PyTorch utils
 """
 
 import datetime
-import logging
 import math
 import os
 import platform
@@ -15,19 +14,17 @@
 from pathlib import Path
 
 import torch
-import torch.backends.cudnn as cudnn
 import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
-import torchvision
+
+from .general import LOGGER
 
 try:
     import thop  # for FLOPs computation
 except ImportError:
     thop = None
 
-LOGGER = logging.getLogger(__name__)
-
 
 @contextmanager
 def torch_distributed_zero_first(local_rank: int):
@@ -42,16 +39,6 @@
         dist.barrier(device_ids=[0])
 
 
-def init_torch_seeds(seed=0):
-    # Speed-reproducibility tradeoff
-    # https://pytorch.org/docs/stable/notes/randomness.html
-    torch.manual_seed(seed)
-    if seed == 0:  # slower, more reproducible
-        cudnn.benchmark, cudnn.deterministic = False, True
-    else:  # faster, less reproducible
-        cudnn.benchmark, cudnn.deterministic = True, False
-
-
 def date_modified(path=__file__):
     # return human-readable file modification date, i.e. '2021-3-26'
     t = datetime.datetime.fromtimestamp(Path(path).stat().st_mtime)
@@ -59,27 +46,27 @@
 
 
 def git_describe(path=Path(__file__).parent):
-    # path must be a directory
-    # return human-readable git description,
-    # i.e. v5.0-5-g3e25f1e https://git-scm.com/docs/git-describe
+    """
+    Return human-readable git description,
+    i.e. v5.0-5-g3e25f1e https://git-scm.com/docs/git-describe
+    """
     s = f"git -C {path} describe --tags --long --always"
     try:
         return subprocess.check_output(s, shell=True, stderr=subprocess.STDOUT).decode()[:-1]
-    except subprocess.CalledProcessError:
-        return ""  # not a git repository
+    except subprocess.CalledProcessError as e:
+        print(f"Warning, not a git repository: {e}")
+        return ""
 
 
-def select_device(device="", batch_size=None):
+def select_device(device="", batch_size=None, newline=True):
     # device = 'cpu' or '0' or '0,1,2,3'
-    s = f"YOLOv5 {git_describe() or date_modified()} torch {torch.__version__} "  # string
+    s = f"YOLOv5 🚀 {git_describe() or date_modified()} torch {torch.__version__} "  # string
     device = str(device).strip().lower().replace("cuda:", "")  # to string, 'cuda:0' to '0'
     cpu = device == "cpu"
     if cpu:
-        # force torch.cuda.is_available() = False
-        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force torch.cuda.is_available() = False
     elif device:  # non-cpu device requested
-        # set environment variable
-        os.environ["CUDA_VISIBLE_DEVICES"] = device
+        os.environ["CUDA_VISIBLE_DEVICES"] = device  # set environment variable
         # check availability
         assert torch.cuda.is_available(), f"CUDA unavailable, invalid device {device} requested"
@@ -94,10 +81,12 @@
         for i, d in enumerate(devices):
             p = torch.cuda.get_device_properties(i)
             # bytes to MB
-            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / 1024 ** 2}MB)\n"
+            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / 1024 ** 2:.0f}MiB)\n"
     else:
         s += "CPU\n"
 
+    if not newline:
+        s = s.rstrip()
     # emoji-safe
     LOGGER.info(s.encode().decode("ascii", "ignore") if platform.system() == "Windows" else s)
     return torch.device("cuda:0" if cuda else "cpu")
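Usage sketch (not part of the patch): the new `newline` flag only controls whether the logged summary string keeps its trailing newline; device selection itself is unchanged. Device strings follow the comment inside the function ('cpu', '0', '0,1,2,3'):

    import torch
    from yolort.v5.utils.torch_utils import select_device

    device = select_device("cpu")               # force CPU
    device = select_device("0", newline=False)  # first CUDA GPU, summary logged without trailing newline
    tensor = torch.zeros(1).to(device)          # the return value is a regular torch.device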
t[2] = float("nan") - # ms per op forward - tf += (t[1] - t[0]) * 1000 / n - # ms per op backward - tb += (t[2] - t[1]) * 1000 / n + tf += (t[1] - t[0]) * 1000 / n # ms per op forward + tb += (t[2] - t[1]) * 1000 / n # ms per op backward mem = torch.cuda.memory_reserved() / 1e9 if torch.cuda.is_available() else 0 # (GB) s_in = tuple(x.shape) if isinstance(x, torch.Tensor) else "list" s_out = tuple(y.shape) if isinstance(y, torch.Tensor) else "list" # parameters p = sum(list(x.numel() for x in m.parameters())) if isinstance(m, nn.Module) else 0 - print( - f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}" - f"{str(s_in):>24s}{str(s_out):>24s}" - ) + print(f"{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}") results.append([p, flops, mem, tf, tb, s_in, s_out]) except Exception as e: print(e) @@ -181,26 +168,16 @@ def profile(input, ops, n=10, device=None): def is_parallel(model): # Returns True if model is of type DP or DDP - return type(model) in ( - nn.parallel.DataParallel, - nn.parallel.DistributedDataParallel, - ) + return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) def de_parallel(model): - # De-parallelize a model: - # returns single-GPU model if model is of type DP or DDP + """ + De-parallelize a model: returns single-GPU model if model is of type DP or DDP + """ return model.module if is_parallel(model) else model -def intersect_dicts(da, db, exclude=()): - # Dictionary intersection of matching keys and shapes, - # omitting 'exclude' keys, using da values - return { - k: v for k, v in da.items() if k in db and not any(x in k for x in exclude) and v.shape == db[k].shape - } - - def initialize_weights(model): for m in model.modules(): t = type(m) @@ -209,7 +186,7 @@ def initialize_weights(model): elif t is nn.BatchNorm2d: m.eps = 1e-3 m.momentum = 0.03 - elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]: + elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]: m.inplace = True @@ -220,7 +197,7 @@ def find_modules(model, mclass=nn.Conv2d): def sparsity(model): # Return global model sparsity - a, b = 0.0, 0.0 + a, b = 0, 0 for p in model.parameters(): a += p.numel() b += (p == 0).sum() @@ -232,17 +209,18 @@ def prune(model, amount=0.3): import torch.nn.utils.prune as prune print("Pruning model... ", end="") - for _, m in model.named_modules(): + for name, m in model.named_modules(): if isinstance(m, nn.Conv2d): - # prune - prune.l1_unstructured(m, name="weight", amount=amount) + prune.l1_unstructured(m, name="weight", amount=amount) # prune prune.remove(m, "weight") # make permanent print(" %.3g global sparsity" % sparsity(model)) def fuse_conv_and_bn(conv, bn): - # Fuse convolution and batchnorm layers - # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ + """ + Fuse convolution and batchnorm layers + https://tehnokv.com/posts/fusing-batchnorm-and-conv/ + """ fusedconv = ( nn.Conv2d( conv.in_channels, @@ -271,23 +249,19 @@ def fuse_conv_and_bn(conv, bn): def model_info(model, verbose=False, img_size=640): - # Model information. img_size may be int or list, - # i.e. img_size=640 or img_size=[640, 320] - - # number parameters - n_p = sum(x.numel() for x in model.parameters()) - # number gradients - n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) + # Model information. img_size may be int or list, i.e. 
@@ -209,7 +186,7 @@
         elif t is nn.BatchNorm2d:
             m.eps = 1e-3
             m.momentum = 0.03
-        elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
+        elif t in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU]:
             m.inplace = True
 
 
@@ -220,7 +197,7 @@ def find_modules(model, mclass=nn.Conv2d):
 
 def sparsity(model):
     # Return global model sparsity
-    a, b = 0.0, 0.0
+    a, b = 0, 0
     for p in model.parameters():
         a += p.numel()
         b += (p == 0).sum()
@@ -232,17 +209,18 @@ def prune(model, amount=0.3):
     import torch.nn.utils.prune as prune
 
     print("Pruning model... ", end="")
-    for _, m in model.named_modules():
+    for name, m in model.named_modules():
         if isinstance(m, nn.Conv2d):
-            # prune
-            prune.l1_unstructured(m, name="weight", amount=amount)
+            prune.l1_unstructured(m, name="weight", amount=amount)  # prune
             prune.remove(m, "weight")  # make permanent
     print(" %.3g global sparsity" % sparsity(model))
 
 
 def fuse_conv_and_bn(conv, bn):
-    # Fuse convolution and batchnorm layers
-    # https://tehnokv.com/posts/fusing-batchnorm-and-conv/
+    """
+    Fuse convolution and batchnorm layers
+    https://tehnokv.com/posts/fusing-batchnorm-and-conv/
+    """
     fusedconv = (
         nn.Conv2d(
             conv.in_channels,
@@ -271,23 +249,19 @@
 def model_info(model, verbose=False, img_size=640):
-    # Model information. img_size may be int or list,
-    # i.e. img_size=640 or img_size=[640, 320]
-
-    # number parameters
-    n_p = sum(x.numel() for x in model.parameters())
-    # number gradients
-    n_g = sum(x.numel() for x in model.parameters() if x.requires_grad)
+    # Model information. img_size may be int or list, i.e. img_size=640 or img_size=[640, 320]
+    n_p = sum(x.numel() for x in model.parameters())  # number parameters
+    n_g = sum(x.numel() for x in model.parameters() if x.requires_grad)  # number gradients
     if verbose:
         print(
-            "%5s %40s %9s %12s %20s %10s %10s"
-            % ("layer", "name", "gradient", "parameters", "shape", "mu", "sigma")
+            f"{'layer':>5} {'name':>40} {'gradient':>9} {'parameters':>12} "
+            f"{'shape':>20} {'mu':>10} {'sigma':>10}"
         )
         for i, (name, p) in enumerate(model.named_parameters()):
             name = name.replace("module_list.", "")
             print(
-                f"{i:5g} {name:40s} {p.requires_grad:9s} {p.numel():12g} "
-                f"{list(p.shape):20s} {p.mean():10.3g} {p.std():10.3g}"
+                "%5g %40s %9s %12g %20s %10.3g %10.3g"
+                % (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())
             )
 
     try:  # FLOPs
         from thop import profile
 
@@ -296,8 +270,7 @@
         stride = max(int(model.stride.max()), 32) if hasattr(model, "stride") else 32
         # input
         img = torch.zeros(
-            (1, model.yaml.get("ch", 3), stride, stride),
-            device=next(model.parameters()).device,
+            (1, model.yaml.get("ch", 3), stride, stride), device=next(model.parameters()).device
         )
         # stride GFLOPs
         flops = profile(deepcopy(model), inputs=(img,), verbose=False)[0] / 1e9 * 2
@@ -308,44 +281,24 @@
     except (ImportError, Exception):
         fs = ""
 
-    LOGGER.info(f"Model Summary: {len(list(model.modules()))} layers, {n_p} parameters, {n_g} gradients{fs}")
-
-
-def load_classifier(name="resnet101", n=2):
-    # Loads a pretrained model reshaped to n-class output
-    model = torchvision.models.__dict__[name](pretrained=True)
-
-    # ResNet model properties
-    # input_size = [3, 224, 224]
-    # input_space = 'RGB'
-    # input_range = [0, 1]
-    # mean = [0.485, 0.456, 0.406]
-    # std = [0.229, 0.224, 0.225]
-
-    # Reshape output to n classes
-    filters = model.fc.weight.shape[1]
-    model.fc.bias = nn.Parameter(torch.zeros(n), requires_grad=True)
-    model.fc.weight = nn.Parameter(torch.zeros(n, filters), requires_grad=True)
-    model.fc.out_features = n
-    return model
+    LOGGER.info(
+        f"Model Summary: {len(list(model.modules()))} layers, " f"{n_p} parameters, {n_g} gradients{fs}"
+    )
 
 
 def scale_img(img, ratio=1.0, same_shape=False, gs=32):
-    # img(16,3,256,416)
-    # scales img(bs,3,y,x) by ratio constrained to gs-multiple
+    """
+    Scales img(bs,3,y,x) by ratio constrained to gs-multiple
+    """
     if ratio == 1.0:
         return img
     else:
         h, w = img.shape[2:]
-        # new size
-        s = (int(h * ratio), int(w * ratio))
-        # resize
-        img = F.interpolate(img, size=s, mode="bilinear", align_corners=False)
-        # pad/crop img
-        if not same_shape:
-            h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
-        # value = imagenet mean
-        return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447)
+        s = (int(h * ratio), int(w * ratio))  # new size
+        img = F.interpolate(img, size=s, mode="bilinear", align_corners=False)  # resize
+        if not same_shape:  # pad/crop img
+            h, w = (math.ceil(x * ratio / gs) * gs for x in (h, w))
+        return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447)  # value = imagenet mean
 
 
 def copy_attr(a, b, include=(), exclude=()):
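Usage sketch (not part of the patch): a hedged illustration of the conv+BN fusion that `fuse_conv_and_bn` performs, assuming the standard YOLOv5 implementation of the function body (which this patch does not change). In eval mode the fused convolution should reproduce the original conv followed by BN up to floating-point error:

    import torch
    import torch.nn as nn
    from yolort.v5.utils.torch_utils import fuse_conv_and_bn

    conv = nn.Conv2d(3, 16, 3, padding=1, bias=False).eval()
    bn = nn.BatchNorm2d(16).eval()  # eval mode so running statistics are used
    fused = fuse_conv_and_bn(conv, bn)

    x = torch.randn(1, 3, 32, 32)
    assert torch.allclose(bn(conv(x)), fused(x), atol=1e-5)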
@@ -360,21 +313,27 @@
 
 
 class EarlyStopping:
     # YOLOv5 simple early stopper
     def __init__(self, patience=30):
-        # i.e. mAP
-        self.best_fitness = 0.0
+        self.best_fitness = 0.0  # i.e. mAP
         self.best_epoch = 0
         # epochs to wait after fitness stops improving to stop
-        self.patience = patience
+        self.patience = patience or float("inf")
+        self.possible_stop = False  # possible stop may occur next epoch
 
     def __call__(self, epoch, fitness):
-        # >= 0 to allow for early zero-fitness stage of training
-        if fitness >= self.best_fitness:
+        if fitness >= self.best_fitness:  # >= 0 to allow for early zero-fitness stage of training
             self.best_epoch = epoch
             self.best_fitness = fitness
-        # stop training if patience exceeded
-        stop = (epoch - self.best_epoch) >= self.patience
+        delta = epoch - self.best_epoch  # epochs without improvement
+        # possible stop may occur next epoch
+        self.possible_stop = delta >= (self.patience - 1)
+        stop = delta >= self.patience  # stop training if patience exceeded
         if stop:
-            LOGGER.info(f"EarlyStopping patience {self.patience} exceeded, stopping training.")
+            LOGGER.info(
+                f"Stopping training early as no improvement observed in last {self.patience} epochs. "
+                f"Best results observed at epoch {self.best_epoch}, best model saved as best.pt.\n"
+                f"To update EarlyStopping(patience={self.patience}) pass a new patience value, "
+                "i.e. `python train.py --patience 300` or use `--patience 0` to disable EarlyStopping."
+            )
         return stop
@@ -409,12 +368,12 @@ def update(self, model):
         with torch.no_grad():
             self.updates += 1
             d = self.decay(self.updates)
-            # model state_dict
+
             msd = model.module.state_dict() if is_parallel(model) else model.state_dict()
             for k, v in self.ema.state_dict().items():
                 if v.dtype.is_floating_point:
                     v *= d
-                    v += (1.0 - d) * msd[k].detach()
+                    v += (1 - d) * msd[k].detach()
 
     def update_attr(self, model, include=(), exclude=("process_group", "reducer")):
         # Update EMA attributes
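Usage sketch (not part of the patch): how the reworked `EarlyStopping` is typically driven from a training loop. `evaluate_fitness` is a hypothetical stand-in for whatever produces the mAP-style fitness value each epoch:

    from yolort.v5.utils.torch_utils import EarlyStopping

    stopper = EarlyStopping(patience=30)
    for epoch in range(300):
        fitness = evaluate_fitness(epoch)  # hypothetical: higher is better, e.g. mAP
        if stopper(epoch, fitness):
            break
        # stopper.possible_stop is True when the next epoch may trigger the stop,
        # which callers can use to e.g. force a checkpoint save beforehand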