From b8d4ee59f02cb1898654c0d0ca6ddc85133086ce Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Sat, 12 Mar 2022 13:17:51 +0100
Subject: [PATCH 1/5] Update common.py speed improvements

Eliminate separate .to() ops where possible by creating tensors directly on the target device with the target dtype, reducing data transfer overhead. Primarily affects warmup and PyTorch Hub inference.
---
 models/common.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/models/common.py b/models/common.py
index 48cf55795dd4..4983db33d89b 100644
--- a/models/common.py
+++ b/models/common.py
@@ -466,7 +466,7 @@ def warmup(self, imgsz=(1, 3, 640, 640)):
         # Warmup model by running inference once
         if self.pt or self.jit or self.onnx or self.engine:  # warmup types
             if isinstance(self.device, torch.device) and self.device.type != 'cpu':  # only warmup GPU models
-                im = torch.zeros(*imgsz).to(self.device).type(torch.half if self.fp16 else torch.float)  # input image
+                im = torch.zeros(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
                 self.forward(im)  # warmup
 
     @staticmethod
@@ -527,7 +527,7 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         autocast = self.amp and (p.device.type != 'cpu')  # Automatic Mixed Precision (AMP) inference
         if isinstance(imgs, torch.Tensor):  # torch
             with amp.autocast(enabled=autocast):
-                return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference
+                return self.model(imgs.to(p.device, dtype=p.dtype), augment, profile)  # inference
 
         # Pre-process
         n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
@@ -551,7 +551,7 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         shape1 = [make_divisible(x, self.stride) if self.pt else size for x in np.array(shape1).max(0)]  # inf shape
         x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
         x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2)))  # stack and BHWC to BCHW
-        x = torch.from_numpy(x).to(p.device).type_as(p) / 255  # uint8 to fp16/32
+        x = torch.tensor(x, device=p.device, dtype=p.dtype) / 255  # uint8 to fp16/32
         t.append(time_sync())
 
         with amp.autocast(enabled=autocast):

From 5ddd304e5148496c7db81e00e1d491cce25207b7 Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Sat, 12 Mar 2022 13:36:38 +0100
Subject: [PATCH 2/5] Update detect.py and val.py to create tensors directly on device

---
 detect.py | 2 +-
 val.py    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/detect.py b/detect.py
index ccb9fbf5103f..6f3750ecbc90 100644
--- a/detect.py
+++ b/detect.py
@@ -109,7 +109,7 @@ def run(weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
     dt, seen = [0.0, 0.0, 0.0], 0
     for path, im, im0s, vid_cap, s in dataset:
         t1 = time_sync()
-        im = torch.from_numpy(im).to(device)
+        im = torch.tensor(im, device=device)
         im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
         im /= 255  # 0 - 255 to 0.0 - 1.0
         if len(im.shape) == 3:
diff --git a/val.py b/val.py
index 8f2119531949..25a46ac106e9 100644
--- a/val.py
+++ b/val.py
@@ -87,7 +87,7 @@ def process_batch(detections, labels, iouv):
             matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
             # matches = matches[matches[:, 2].argsort()[::-1]]
             matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
-        matches = torch.Tensor(matches).to(iouv.device)
+        matches = torch.tensor(matches, device=iouv.device)
         correct[matches[:, 1].long()] = matches[:, 2:3] >= iouv
     return correct
 
@@ -155,7 +155,7 @@ def run(data,
     cuda = device.type != 'cpu'
     is_coco = isinstance(data.get('val'), str) and data['val'].endswith('coco/val2017.txt')  # COCO dataset
     nc = 1 if single_cls else int(data['nc'])  # number of classes
-    iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
+    iouv = torch.linspace(0.5, 0.95, 10, device=device)  # iou vector for mAP@0.5:0.95
     niou = iouv.numel()
 
     # Dataloader
@@ -196,7 +196,7 @@ def run(data,
             loss += compute_loss([x.float() for x in train_out], targets)[1]  # box, obj, cls
 
         # NMS
-        targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device)  # to pixels
+        targets[:, 2:] *= torch.tensor((width, height, width, height), device=device)  # to pixels
         lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else []  # for autolabelling
         t3 = time_sync()
         out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls)

From fb066cda88f49158c2cc32c98e79f9616f024c61 Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Sat, 12 Mar 2022 13:44:53 +0100
Subject: [PATCH 3/5] Revert common.py forward() conversions to .to(p.device).type_as(p)

---
 models/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/models/common.py b/models/common.py
index 4983db33d89b..83aecb7569d6 100644
--- a/models/common.py
+++ b/models/common.py
@@ -527,7 +527,7 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         autocast = self.amp and (p.device.type != 'cpu')  # Automatic Mixed Precision (AMP) inference
         if isinstance(imgs, torch.Tensor):  # torch
             with amp.autocast(enabled=autocast):
-                return self.model(imgs.to(p.device, dtype=p.dtype), augment, profile)  # inference
+                return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference
 
         # Pre-process
         n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
@@ -551,7 +551,7 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         shape1 = [make_divisible(x, self.stride) if self.pt else size for x in np.array(shape1).max(0)]  # inf shape
         x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs]  # pad
         x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2)))  # stack and BHWC to BCHW
-        x = torch.tensor(x, device=p.device, dtype=p.dtype) / 255  # uint8 to fp16/32
+        x = torch.from_numpy(x).to(p.device).type_as(p) / 255  # uint8 to fp16/32
         t.append(time_sync())
 
         with amp.autocast(enabled=autocast):

From 7d20e3ca1cec56fd300aebe52242a5b74aac5edf Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Sat, 12 Mar 2022 13:46:49 +0100
Subject: [PATCH 4/5] Revert detect.py to torch.from_numpy(im).to(device)

---
 detect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/detect.py b/detect.py
index 6f3750ecbc90..ccb9fbf5103f 100644
--- a/detect.py
+++ b/detect.py
@@ -109,7 +109,7 @@ def run(weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
     dt, seen = [0.0, 0.0, 0.0], 0
     for path, im, im0s, vid_cap, s in dataset:
         t1 = time_sync()
-        im = torch.tensor(im, device=device)
+        im = torch.from_numpy(im).to(device)
         im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
         im /= 255  # 0 - 255 to 0.0 - 1.0
         if len(im.shape) == 3:

From 4f3bbea14718538f7603460787a5c9ac39773062 Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Sat, 12 Mar 2022 13:55:36 +0100
Subject: [PATCH 5/5] Update val.py process_batch() to use torch.from_numpy(matches)

---
 val.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/val.py b/val.py
index 25a46ac106e9..2dd2aec679f9 100644
--- a/val.py
+++ b/val.py
@@ -87,7 +87,7 @@ def process_batch(detections, labels, iouv):
             matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
             # matches = matches[matches[:, 2].argsort()[::-1]]
             matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
-        matches = torch.tensor(matches, device=iouv.device)
+        matches = torch.from_numpy(matches).to(iouv.device)
         correct[matches[:, 1].long()] = matches[:, 2:3] >= iouv
     return correct