Add TensorFlow and TFLite export #1127

Merged · 71 commits · Aug 17, 2021

Commits
5b6528c
Add models/tf.py for TensorFlow and TFLite export
zldrobit Oct 9, 2020
a30dad4
Set auto=False for int8 calibration
zldrobit Oct 9, 2020
f0cb6e2
Update requirements.txt for TensorFlow and TFLite export
zldrobit Oct 20, 2020
ce73d3d
Read anchors directly from PyTorch weights
zldrobit Oct 23, 2020
d101f7e
Add --tf-nms to append NMS in TensorFlow SavedModel and GraphDef export
zldrobit Nov 2, 2020
55fb2eb
Remove check_anchor_order, check_file, set_logging from import
zldrobit Nov 16, 2020
3dd69d9
Reformat code and optimize imports
glenn-jocher Nov 28, 2020
e8a9ad2
Autodownload model and check cfg
glenn-jocher Nov 28, 2020
efbb853
update --source path, img-size to 320, single output
glenn-jocher Nov 28, 2020
aed53ce
Adjust representative_dataset
glenn-jocher Nov 28, 2020
ccb2336
Put representative dataset in tfl_int8 block
zldrobit Nov 30, 2020
9f893c8
detect.py TF inference
glenn-jocher Dec 2, 2020
d9fad06
Merge remote-tracking branch 'origin/tf-only-export' into tf-only-export
glenn-jocher Dec 2, 2020
49a9e05
weights to string
glenn-jocher Dec 2, 2020
1867bb4
weights to string
glenn-jocher Dec 2, 2020
4eed608
cleanup tf.py
glenn-jocher Dec 4, 2020
8ba2ca9
Add --dynamic-batch-size
zldrobit Dec 22, 2020
4d9104a
Add xywh normalization to reduce calibration error
zldrobit Dec 22, 2020
ae9bce8
Merge branch 'master' into tf-only-export
glenn-jocher Dec 22, 2020
cabb802
Update requirements.txt
zldrobit Dec 23, 2020
a5967f8
Fix imports
zldrobit Dec 24, 2020
dbc7f71
Add models/tf.py for TensorFlow and TFLite export
zldrobit Oct 9, 2020
565e620
Set auto=False for int8 calibration
zldrobit Oct 9, 2020
fc26561
Update requirements.txt for TensorFlow and TFLite export
zldrobit Oct 20, 2020
05cc389
Read anchors directly from PyTorch weights
zldrobit Oct 23, 2020
817fcf8
Add --tf-nms to append NMS in TensorFlow SavedModel and GraphDef export
zldrobit Nov 2, 2020
9a4ce5e
Remove check_anchor_order, check_file, set_logging from import
zldrobit Nov 16, 2020
719479e
Reformat code and optimize imports
glenn-jocher Nov 28, 2020
243fa7f
Autodownload model and check cfg
glenn-jocher Nov 28, 2020
3b6cf12
update --source path, img-size to 320, single output
glenn-jocher Nov 28, 2020
5a5f949
Adjust representative_dataset
glenn-jocher Nov 28, 2020
00d50fd
detect.py TF inference
glenn-jocher Dec 2, 2020
061907b
Put representative dataset in tfl_int8 block
zldrobit Nov 30, 2020
ca4550b
weights to string
glenn-jocher Dec 2, 2020
5e04c5c
weights to string
glenn-jocher Dec 2, 2020
9121a87
cleanup tf.py
glenn-jocher Dec 4, 2020
e9bc606
Add --dynamic-batch-size
zldrobit Dec 22, 2020
dacd8af
Add xywh normalization to reduce calibration error
zldrobit Dec 22, 2020
b492af9
Update requirements.txt
zldrobit Dec 23, 2020
fbf5a45
Fix imports
zldrobit Dec 24, 2020
c4cfbd9
Merge branch 'tf-only-export' of https://github.com/zldrobit/yolov5 i…
glenn-jocher Jan 8, 2021
c761637
implement C3() and SiLU()
glenn-jocher Jan 8, 2021
36ed3cd
Fix reshape dim to support dynamic batching
zldrobit Feb 2, 2021
aad9e24
Merge branch 'master' into tf-only-export
glenn-jocher Feb 4, 2021
8ec975e
Merge branch 'master' into tf-only-export
glenn-jocher Feb 6, 2021
b0fa5a3
Add epsilon argument in tf_BN, which is different between TF and PT
zldrobit Mar 11, 2021
710bf56
Set stride to None if not using PyTorch, and do not warmup without Py…
zldrobit Mar 12, 2021
c45ceef
Add list support in check_img_size()
zldrobit Mar 23, 2021
0d39b24
Add list input support in detect.py
zldrobit Mar 23, 2021
47da942
merge ultralytics:master
glenn-jocher Mar 25, 2021
8cb7032
merge ultralytics:master
glenn-jocher Mar 28, 2021
4e1485b
sys.path.append('./') to run from yolov5/
glenn-jocher Mar 28, 2021
e4e6d6f
Add int8 quantization support for TensorFlow 2.5
zldrobit Apr 3, 2021
aafe224
Add get_coco128.sh
zldrobit May 6, 2021
d3be281
Remove --no-tfl-detect in models/tf.py (Use tf-android-tfl-detect bra…
zldrobit May 6, 2021
9ca5d7a
Merge branch 'develop' into tf-only-export
glenn-jocher Jun 2, 2021
215c865
Update requirements.txt
glenn-jocher Jun 2, 2021
a2867da
Replace torch.load() with attempt_load()
zldrobit Jun 7, 2021
86768d1
Update requirements.txt
zldrobit Jun 7, 2021
972c6a2
Add --tf-raw-resize to set half_pixel_centers=False
zldrobit Jun 11, 2021
eed4980
Add --agnostic-nms for TF class-agnostic NMS
zldrobit Jun 18, 2021
34b7d67
Merge master
glenn-jocher Aug 16, 2021
10eebf2
Cleanup after merge
glenn-jocher Aug 16, 2021
e3aa755
Cleanup2 after merge
glenn-jocher Aug 16, 2021
27fc39b
Cleanup3 after merge
glenn-jocher Aug 16, 2021
48a6bf8
Add tf.py docstring with credit and usage
glenn-jocher Aug 16, 2021
4e12c70
pb saved_model and tflite use only one model in detect.py
zldrobit Aug 17, 2021
ea0274f
Add use cases in docstring of tf.py
zldrobit Aug 17, 2021
2f63c6d
Remove redundant `stride` definition
glenn-jocher Aug 17, 2021
c3f46c3
Remove keras direct import
glenn-jocher Aug 17, 2021
3133381
Fix `check_requirements(('tensorflow>=2.4.1',))`
glenn-jocher Aug 17, 2021
64 changes: 54 additions & 10 deletions detect.py
@@ -12,6 +12,7 @@
from pathlib import Path

import cv2
import numpy as np
import torch
import torch.backends.cudnn as cudnn

@@ -51,6 +52,7 @@ def run(weights='yolov5s.pt', # model.pt path(s)
hide_labels=False, # hide labels
hide_conf=False, # hide confidences
half=False, # use FP16 half-precision inference
tfl_int8=False, # INT8 quantized TFLite model
):
save_img = not nosave and not source.endswith('.txt') # save inference images
webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
@@ -68,7 +70,7 @@ def run(weights='yolov5s.pt', # model.pt path(s)
# Load model
w = weights[0] if isinstance(weights, list) else weights
classify, suffix = False, Path(w).suffix.lower()
pt, onnx, tflite, pb, graph_def = (suffix == x for x in ['.pt', '.onnx', '.tflite', '.pb', '']) # backend
pt, onnx, tflite, pb, saved_model = (suffix == x for x in ['.pt', '.onnx', '.tflite', '.pb', '']) # backend
stride, names = 64, [f'class{i}' for i in range(1000)] # assign defaults
if pt:
model = attempt_load(weights, map_location=device) # load FP32 model
@@ -83,30 +85,49 @@ def run(weights='yolov5s.pt', # model.pt path(s)
check_requirements(('onnx', 'onnxruntime'))
import onnxruntime
session = onnxruntime.InferenceSession(w, None)
else: # TensorFlow models
check_requirements(('tensorflow>=2.4.1',))
import tensorflow as tf
if pb: # https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
def wrap_frozen_graph(gd, inputs, outputs):
x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=""), []) # wrapped import
return x.prune(tf.nest.map_structure(x.graph.as_graph_element, inputs),
tf.nest.map_structure(x.graph.as_graph_element, outputs))

graph_def = tf.Graph().as_graph_def()
graph_def.ParseFromString(open(w, 'rb').read())
frozen_func = wrap_frozen_graph(gd=graph_def, inputs="x:0", outputs="Identity:0")
elif saved_model:
model = tf.keras.models.load_model(w)
elif tflite:
interpreter = tf.lite.Interpreter(model_path=w) # load TFLite model
interpreter.allocate_tensors() # allocate
input_details = interpreter.get_input_details() # inputs
output_details = interpreter.get_output_details() # outputs
imgsz = check_img_size(imgsz, s=stride) # check image size

# Dataloader
if webcam:
view_img = check_imshow()
cudnn.benchmark = True # set True to speed up constant image size inference
dataset = LoadStreams(source, img_size=imgsz, stride=stride)
dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
bs = len(dataset) # batch_size
else:
dataset = LoadImages(source, img_size=imgsz, stride=stride)
dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)
bs = 1 # batch_size
vid_path, vid_writer = [None] * bs, [None] * bs

# Run inference
if pt and device.type != 'cpu':
model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters()))) # run once
model(torch.zeros(1, 3, *imgsz).to(device).type_as(next(model.parameters()))) # run once
t0 = time.time()
for path, img, im0s, vid_cap in dataset:
if pt:
if onnx:
img = img.astype('float32')
else:
img = torch.from_numpy(img).to(device)
img = img.half() if half else img.float() # uint8 to fp16/32
elif onnx:
img = img.astype('float32')
img /= 255.0 # 0 - 255 to 0.0 - 1.0
img = img / 255.0 # 0 - 255 to 0.0 - 1.0
if len(img.shape) == 3:
img = img[None] # expand for batch dim

@@ -117,6 +138,27 @@ def run(weights='yolov5s.pt', # model.pt path(s)
pred = model(img, augment=augment, visualize=visualize)[0]
elif onnx:
pred = torch.tensor(session.run([session.get_outputs()[0].name], {session.get_inputs()[0].name: img}))
else: # tensorflow model (tflite, pb, saved_model)
imn = img.permute(0, 2, 3, 1).cpu().numpy() # image in numpy
if pb:
pred = frozen_func(x=tf.constant(imn)).numpy()
elif saved_model:
pred = model(imn, training=False).numpy()
elif tflite:
if tfl_int8:
scale, zero_point = input_details[0]['quantization']
imn = (imn / scale + zero_point).astype(np.uint8)
interpreter.set_tensor(input_details[0]['index'], imn)
interpreter.invoke()
pred = interpreter.get_tensor(output_details[0]['index'])
if tfl_int8:
scale, zero_point = output_details[0]['quantization']
pred = (pred.astype(np.float32) - zero_point) * scale
pred[..., 0] *= imgsz[1] # x
pred[..., 1] *= imgsz[0] # y
pred[..., 2] *= imgsz[1] # w
pred[..., 3] *= imgsz[0] # h
pred = torch.tensor(pred)

# NMS
pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
@@ -202,9 +244,9 @@ def run(weights='yolov5s.pt', # model.pt path(s)

def parse_opt():
parser = argparse.ArgumentParser()
parser.add_argument('--weights', nargs='+', type=str, default='yolov5s.pt', help='model.pt path(s)')
parser.add_argument('--weights', nargs='+', type=str, default='yolov5s.pb', help='model.pt path(s)')
parser.add_argument('--source', type=str, default='data/images', help='file/dir/URL/glob, 0 for webcam')
parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='inference size (pixels)')
parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w')
parser.add_argument('--conf-thres', type=float, default=0.25, help='confidence threshold')
parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
@@ -226,7 +268,9 @@ def parse_opt():
parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels')
parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
parser.add_argument('--tfl-int8', action='store_true', help='INT8 quantized TFLite model')
opt = parser.parse_args()
opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1 # expand
return opt


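To make the new backend path easier to follow outside the diff, below is a minimal standalone sketch of the INT8 TFLite branch added to detect.py above: the float32 input is quantized with the input tensor's (scale, zero_point), the uint8 output is dequantized back to float32, and the xywh-normalized boxes are rescaled to pixels before NMS. The filename 'yolov5s-int8.tflite' and the random input are illustrative placeholders, not artifacts produced by this PR.

import numpy as np
import tensorflow as tf

# Load the quantized model (placeholder filename) and allocate its tensors
interpreter = tf.lite.Interpreter(model_path='yolov5s-int8.tflite')
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Stand-in for a letterboxed RGB image: NHWC float32 in [0, 1]
imgsz = (320, 320)  # inference height, width
imn = np.random.rand(1, imgsz[0], imgsz[1], 3).astype(np.float32)

# Quantize the float input to uint8 using the input tensor's (scale, zero_point)
scale, zero_point = input_details[0]['quantization']
interpreter.set_tensor(input_details[0]['index'], (imn / scale + zero_point).astype(np.uint8))
interpreter.invoke()

# Dequantize the uint8 output back to float
pred = interpreter.get_tensor(output_details[0]['index'])
scale, zero_point = output_details[0]['quantization']
pred = (pred.astype(np.float32) - zero_point) * scale

# Boxes are exported xywh-normalized; rescale to pixel coordinates before NMS
pred[..., 0] *= imgsz[1]  # x
pred[..., 1] *= imgsz[0]  # y
pred[..., 2] *= imgsz[1]  # w
pred[..., 3] *= imgsz[0]  # h

With the new parse_opt() flags, a TensorFlow-backend run of detect.py would look roughly like this (weight filenames illustrative):

python detect.py --weights yolov5s-int8.tflite --imgsz 320 320 --tfl-int8
python detect.py --weights yolov5s.pb --imgsz 320 320
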
8 changes: 6 additions & 2 deletions models/experimental.py
@@ -85,14 +85,18 @@ def forward(self, x, augment=False, profile=False, visualize=False):
return y, None # inference, train output


def attempt_load(weights, map_location=None, inplace=True):
def attempt_load(weights, map_location=None, inplace=True, fuse=True):
from models.yolo import Detect, Model

# Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a
model = Ensemble()
for w in weights if isinstance(weights, list) else [weights]:
ckpt = torch.load(attempt_download(w), map_location=map_location) # load
model.append(ckpt['ema' if ckpt.get('ema') else 'model'].float().fuse().eval()) # FP32 model
if fuse:
model.append(ckpt['ema' if ckpt.get('ema') else 'model'].float().fuse().eval()) # FP32 model
else:
model.append(ckpt['ema' if ckpt.get('ema') else 'model'].float().eval()) # without layer fuse


# Compatibility updates
for m in model.modules():
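The new fuse flag preserves the default behaviour (Conv+BN fusion) for regular inference while letting callers load an un-fused FP32 model, likely what models/tf.py needs so that BatchNorm layers survive and can be mapped to their TF equivalents (see the "Add epsilon argument in tf_BN" commit). A brief usage sketch, with the checkpoint path and device as placeholders:

from models.experimental import attempt_load

fused = attempt_load('yolov5s.pt', map_location='cpu')                # default fuse=True, same as before
unfused = attempt_load('yolov5s.pt', map_location='cpu', fuse=False)  # keep Conv and BatchNorm separate for export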