From b8d4ee59f02cb1898654c0d0ca6ddc85133086ce Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 12 Mar 2022 13:17:51 +0100 Subject: [PATCH 1/5] Update common.py speed improvements Eliminate .to() ops where possible for reduced data transfer overhead. Primarily affects warmup and PyTorch Hub inference. --- models/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/models/common.py b/models/common.py index 48cf55795dd4..4983db33d89b 100644 --- a/models/common.py +++ b/models/common.py @@ -466,7 +466,7 @@ def warmup(self, imgsz=(1, 3, 640, 640)): # Warmup model by running inference once if self.pt or self.jit or self.onnx or self.engine: # warmup types if isinstance(self.device, torch.device) and self.device.type != 'cpu': # only warmup GPU models - im = torch.zeros(*imgsz).to(self.device).type(torch.half if self.fp16 else torch.float) # input image + im = torch.zeros(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input self.forward(im) # warmup @staticmethod @@ -527,7 +527,7 @@ def forward(self, imgs, size=640, augment=False, profile=False): autocast = self.amp and (p.device.type != 'cpu') # Automatic Mixed Precision (AMP) inference if isinstance(imgs, torch.Tensor): # torch with amp.autocast(enabled=autocast): - return self.model(imgs.to(p.device).type_as(p), augment, profile) # inference + return self.model(imgs.to(p.device, dtype=p.dtype), augment, profile) # inference # Pre-process n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs]) # number of images, list of images @@ -551,7 +551,7 @@ def forward(self, imgs, size=640, augment=False, profile=False): shape1 = [make_divisible(x, self.stride) if self.pt else size for x in np.array(shape1).max(0)] # inf shape x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs] # pad x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2))) # stack and BHWC to BCHW - x = torch.from_numpy(x).to(p.device).type_as(p) / 255 # uint8 to fp16/32 + x = torch.tensor(x, device=p.device, dtype=p.dtype) / 255 # uint8 to fp16/32 t.append(time_sync()) with amp.autocast(enabled=autocast): From 5ddd304e5148496c7db81e00e1d491cce25207b7 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 12 Mar 2022 13:36:38 +0100 Subject: [PATCH 2/5] Updates --- detect.py | 2 +- val.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/detect.py b/detect.py index ccb9fbf5103f..6f3750ecbc90 100644 --- a/detect.py +++ b/detect.py @@ -109,7 +109,7 @@ def run(weights=ROOT / 'yolov5s.pt', # model.pt path(s) dt, seen = [0.0, 0.0, 0.0], 0 for path, im, im0s, vid_cap, s in dataset: t1 = time_sync() - im = torch.from_numpy(im).to(device) + im = torch.tensor(im, device=device) im = im.half() if model.fp16 else im.float() # uint8 to fp16/32 im /= 255 # 0 - 255 to 0.0 - 1.0 if len(im.shape) == 3: diff --git a/val.py b/val.py index 8f2119531949..25a46ac106e9 100644 --- a/val.py +++ b/val.py @@ -87,7 +87,7 @@ def process_batch(detections, labels, iouv): matches = matches[np.unique(matches[:, 1], return_index=True)[1]] # matches = matches[matches[:, 2].argsort()[::-1]] matches = matches[np.unique(matches[:, 0], return_index=True)[1]] - matches = torch.Tensor(matches).to(iouv.device) + matches = torch.tensor(matches, device=iouv.device) correct[matches[:, 1].long()] = matches[:, 2:3] >= iouv return correct @@ -155,7 +155,7 @@ def run(data, cuda = device.type != 'cpu' is_coco = isinstance(data.get('val'), str) and data['val'].endswith('coco/val2017.txt') # COCO dataset nc = 1 if single_cls else int(data['nc']) # number of classes - iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95 + iouv = torch.linspace(0.5, 0.95, 10, device=device) # iou vector for mAP@0.5:0.95 niou = iouv.numel() # Dataloader @@ -196,7 +196,7 @@ def run(data, loss += compute_loss([x.float() for x in train_out], targets)[1] # box, obj, cls # NMS - targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device) # to pixels + targets[:, 2:] *= torch.tensor((width, height, width, height), device=device) # to pixels lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else [] # for autolabelling t3 = time_sync() out = non_max_suppression(out, conf_thres, iou_thres, labels=lb, multi_label=True, agnostic=single_cls) From fb066cda88f49158c2cc32c98e79f9616f024c61 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 12 Mar 2022 13:44:53 +0100 Subject: [PATCH 3/5] Updates --- models/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/common.py b/models/common.py index 4983db33d89b..83aecb7569d6 100644 --- a/models/common.py +++ b/models/common.py @@ -527,7 +527,7 @@ def forward(self, imgs, size=640, augment=False, profile=False): autocast = self.amp and (p.device.type != 'cpu') # Automatic Mixed Precision (AMP) inference if isinstance(imgs, torch.Tensor): # torch with amp.autocast(enabled=autocast): - return self.model(imgs.to(p.device, dtype=p.dtype), augment, profile) # inference + return self.model(imgs.to(p.device).type_as(p), augment, profile) # inference # Pre-process n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs]) # number of images, list of images @@ -551,7 +551,7 @@ def forward(self, imgs, size=640, augment=False, profile=False): shape1 = [make_divisible(x, self.stride) if self.pt else size for x in np.array(shape1).max(0)] # inf shape x = [letterbox(im, new_shape=shape1, auto=False)[0] for im in imgs] # pad x = np.ascontiguousarray(np.array(x).transpose((0, 3, 1, 2))) # stack and BHWC to BCHW - x = torch.tensor(x, device=p.device, dtype=p.dtype) / 255 # uint8 to fp16/32 + x = torch.from_numpy(x).to(p.device).type_as(p) / 255 # uint8 to fp16/32 t.append(time_sync()) with amp.autocast(enabled=autocast): From 7d20e3ca1cec56fd300aebe52242a5b74aac5edf Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 12 Mar 2022 13:46:49 +0100 Subject: [PATCH 4/5] Update detect.py --- detect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detect.py b/detect.py index 6f3750ecbc90..ccb9fbf5103f 100644 --- a/detect.py +++ b/detect.py @@ -109,7 +109,7 @@ def run(weights=ROOT / 'yolov5s.pt', # model.pt path(s) dt, seen = [0.0, 0.0, 0.0], 0 for path, im, im0s, vid_cap, s in dataset: t1 = time_sync() - im = torch.tensor(im, device=device) + im = torch.from_numpy(im).to(device) im = im.half() if model.fp16 else im.float() # uint8 to fp16/32 im /= 255 # 0 - 255 to 0.0 - 1.0 if len(im.shape) == 3: From 4f3bbea14718538f7603460787a5c9ac39773062 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 12 Mar 2022 13:55:36 +0100 Subject: [PATCH 5/5] Update val.py --- val.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/val.py b/val.py index 25a46ac106e9..2dd2aec679f9 100644 --- a/val.py +++ b/val.py @@ -87,7 +87,7 @@ def process_batch(detections, labels, iouv): matches = matches[np.unique(matches[:, 1], return_index=True)[1]] # matches = matches[matches[:, 2].argsort()[::-1]] matches = matches[np.unique(matches[:, 0], return_index=True)[1]] - matches = torch.tensor(matches, device=iouv.device) + matches = torch.from_numpy(matches).to(iouv.device) correct[matches[:, 1].long()] = matches[:, 2:3] >= iouv return correct