From e28ebae3e8d9404b510ac9d779b5c497de4b066d Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Fri, 9 Apr 2021 18:19:49 +0200
Subject: [PATCH] torch.cuda.amp bug fix (#2750)

PR https://github.com/ultralytics/yolov5/pull/2725 introduced a very specific bug that only affects multi-GPU trainings. Apparently the cause was using the torch.cuda.amp decorator in the autoShape forward method. I've implemented amp more traditionally in this PR, and the bug is resolved.

(cherry picked from commit b5de52c4cdfefb3c7acfbff7d7f450a46b4aaada)
---
 models/common.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/models/common.py b/models/common.py
index c77ecbeceace..1130471e904b 100644
--- a/models/common.py
+++ b/models/common.py
@@ -10,6 +10,7 @@
 import torch
 import torch.nn as nn
 from PIL import Image
+from torch.cuda import amp
 
 from utils.datasets import letterbox
 from utils.general import non_max_suppression, make_divisible, scale_coords, increment_path, xyxy2xywh
@@ -237,7 +238,6 @@ def autoshape(self):
         return self
 
     @torch.no_grad()
-    @torch.cuda.amp.autocast(torch.cuda.is_available())
     def forward(self, imgs, size=640, augment=False, profile=False):
         # Inference from various sources. For height=640, width=1280, RGB images example inputs are:
         #   filename:   imgs = 'data/samples/zidane.jpg'
@@ -251,7 +251,8 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         t = [time_synchronized()]
         p = next(self.model.parameters())  # for device and type
         if isinstance(imgs, torch.Tensor):  # torch
-            return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference
+            with amp.autocast(enabled=p.device.type != 'cpu'):
+                return self.model(imgs.to(p.device).type_as(p), augment, profile)  # inference
 
         # Pre-process
         n, imgs = (len(imgs), imgs) if isinstance(imgs, list) else (1, [imgs])  # number of images, list of images
@@ -278,17 +279,18 @@ def forward(self, imgs, size=640, augment=False, profile=False):
         x = torch.from_numpy(x).to(p.device).type_as(p) / 255.  # uint8 to fp16/32
         t.append(time_synchronized())
 
-        # Inference
-        y = self.model(x, augment, profile)[0]  # forward
-        t.append(time_synchronized())
+        with amp.autocast(enabled=p.device.type != 'cpu'):
+            # Inference
+            y = self.model(x, augment, profile)[0]  # forward
+            t.append(time_synchronized())
 
-        # Post-process
-        y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
-        for i in range(n):
-            scale_coords(shape1, y[i][:, :4], shape0[i])
+            # Post-process
+            y = non_max_suppression(y, conf_thres=self.conf, iou_thres=self.iou, classes=self.classes)  # NMS
+            for i in range(n):
+                scale_coords(shape1, y[i][:, :4], shape0[i])
 
-        t.append(time_synchronized())
-        return Detections(imgs, y, files, t, self.names, x.shape)
+            t.append(time_synchronized())
+            return Detections(imgs, y, files, t, self.names, x.shape)
 
 
 class Detections: