Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add trtexec TensorRT export #6984

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions export.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,19 @@ def export_coreml(model, im, file, int8, half, prefix=colorstr('CoreML:')):
return None, None


def export_engine(model, im, file, train, half, simplify, workspace=4, verbose=False, prefix=colorstr('TensorRT:')):
def export_engine(model,
im,
file,
train,
half,
simplify,
nms=False,
topk_all=None,
iou_thres=None,
conf_thres=None,
workspace=4,
verbose=False,
prefix=colorstr('TensorRT:')):
# YOLOv5 TensorRT export https://developer.nvidia.com/tensorrt
try:
assert im.device.type != 'cpu', 'export running on CPU but must be on GPU, i.e. `python export.py --device 0`'
Expand All @@ -233,6 +245,11 @@ def export_engine(model, im, file, train, half, simplify, workspace=4, verbose=F
else: # TensorRT >= 8
check_version(trt.__version__, '8.0.0', hard=True) # require tensorrt>=8.0.0
export_onnx(model, im, file, 13, train, False, simplify) # opset 13
if nms:
from utils.general import nmsRegister
export_onnx(model, im, file, 12, train, False, simplify)
LOGGER.info(f'\n{prefix} EfficientNMS plugin only supprot TensorRT greater than 8.0.0 ...')
file = nmsRegister(file, train, topk_all=topk_all, iou_thres=iou_thres, conf_thres=conf_thres)
onnx = file.with_suffix('.onnx')

LOGGER.info(f'\n{prefix} starting export with TensorRT {trt.__version__}...')
Expand Down Expand Up @@ -261,8 +278,8 @@ def export_engine(model, im, file, train, half, simplify, workspace=4, verbose=F
for out in outputs:
LOGGER.info(f'{prefix}\toutput "{out.name}" with shape {out.shape} and dtype {out.dtype}')

LOGGER.info(f'{prefix} building FP{16 if builder.platform_has_fast_fp16 else 32} engine in {f}')
if builder.platform_has_fast_fp16:
LOGGER.info(f'{prefix} building FP{16 if half and builder.platform_has_fast_fp16 else 32} engine in {f}')
if half and builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
with builder.build_engine(network, config) as engine, open(f, 'wb') as t:
t.write(engine.serialize())
Expand Down Expand Up @@ -515,7 +532,8 @@ def run(
if jit:
f[0] = export_torchscript(model, im, file, optimize)
if engine: # TensorRT required before ONNX
f[1] = export_engine(model, im, file, train, half, simplify, workspace, verbose)
f[1] = export_engine(model, im, file, train, half, simplify, nms, topk_all, iou_thres, conf_thres, workspace,
verbose)
if onnx or xml: # OpenVINO requires ONNX
f[2] = export_onnx(model, im, file, opset, train, dynamic, simplify)
if xml: # OpenVINO
Expand Down
17 changes: 17 additions & 0 deletions models/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,3 +719,20 @@ def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, str
def forward(self, x):
z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1) # cat if list
return self.flat(self.conv(z)) # flatten to x(b,c2)


class AdditionNet(nn.Module):
    # AdditionNet preprocess for registering EfficientNMS into TensorRT engine
    # Only supports static input shapes
    """Split raw YOLOv5 predictions into the (boxes, scores) pair expected by
    the EfficientNMS_TRT plugin.

    Input x is assumed to be (batch, anchors, 5 + num_classes) laid out as
    [cx, cy, w, h, obj_conf, cls_scores...] — TODO confirm against exporter.
    Returns:
        box:   (batch, anchors, 4) corner-format boxes (x1, y1, x2, y2)
        score: (batch, anchors, num_classes) class scores scaled by objectness
    """

    def __init__(self):
        super().__init__()
        # (cx, cy, w, h) -> (x1, y1, x2, y2) conversion expressed as a matmul.
        # Registered as a buffer (not a plain attribute) so it follows the
        # module across .to(device) / .half() calls.
        self.register_buffer(
            'convert_matrix',
            torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]],
                         dtype=torch.float32))

    def forward(self, x):
        box = x[:, :, :4]  # (cx, cy, w, h)
        conf = x[:, :, 4:5]  # objectness
        # Out-of-place ops: the previous in-place `score *= conf` / `box @= ...`
        # wrote back through slice views and mutated the caller's tensor.
        score = x[:, :, 5:] * conf
        box = box @ self.convert_matrix
        return box, score
367 changes: 367 additions & 0 deletions test-nms.ipynb

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import re
import shutil
import signal
import tempfile
import threading
import time
import urllib
Expand Down Expand Up @@ -989,3 +990,68 @@ def imshow(path, im):

# Variables ------------------------------------------------------------------------------------------------------------
NCOLS = 0 if is_docker() else shutil.get_terminal_size().columns # terminal window size for tqdm


# onnx-graphsurgeon ---------------------------------------------------------------------------------------------------
def nmsRegister(file, train, topk_all=100, iou_thres=0.45, conf_thres=0.25):
    """Graft a TensorRT EfficientNMS_TRT node onto an exported ONNX model.

    Loads ``file.onnx``, splices in an AdditionNet post-processing subgraph
    (box decode + objectness-scaled class scores), then appends a single
    EfficientNMS_TRT op so the built engine performs NMS on-device.

    Args:
        file (Path): model path; ``file.with_suffix('.onnx')`` is read and overwritten.
        train (bool): export the helper subgraph in training mode (disables constant folding).
        topk_all (int): maximum detections kept per image.
        iou_thres (float): NMS IoU threshold.
        conf_thres (float): score threshold.

    Returns:
        Path: ``file`` (its sibling ``.onnx`` now contains the NMS graph).
    """
    from models.common import AdditionNet
    try:
        import onnx_graphsurgeon as gs
    except Exception:
        # onnx-graphsurgeon wheels are hosted on NVIDIA's package index
        if platform.system() == 'Linux':
            check_requirements(('onnx-graphsurgeon',), cmds=('-U --index-url https://pypi.ngc.nvidia.com',))
        import onnx_graphsurgeon as gs

    check_requirements(('onnx',))
    import onnx

    # Resolve defaults per argument. Previously, any single falsy value reset
    # ALL three to defaults, silently discarding explicitly-passed thresholds.
    topk_all = topk_all or 100
    iou_thres = iou_thres or 0.45
    conf_thres = conf_thres or 0.25

    gs_graph = gs.import_onnx(onnx.load(file.with_suffix('.onnx')))

    # Export the AdditionNet helper to a temp ONNX file, sized to the main
    # graph's (static) output shape.
    # NOTE(review): NamedTemporaryFile is re-read by name while still open —
    # fine on POSIX, may fail on Windows; confirm target platforms.
    tmpFile = tempfile.NamedTemporaryFile(suffix=".onnx")
    torch.onnx.export(AdditionNet(),
                      torch.randn(gs_graph.outputs[0].shape),
                      tmpFile.name,
                      verbose=False,
                      opset_version=13,
                      training=torch.onnx.TrainingMode.TRAINING if train else torch.onnx.TrainingMode.EVAL,
                      do_constant_folding=not train,
                      input_names=["tmpInput"],
                      output_names=["boxes", "scores"],
                      dynamic_axes=None)
    tmp_graph = gs.import_onnx(onnx.load(tmpFile.name))

    # Splice the helper nodes into the main graph, rewiring any node that reads
    # the helper's placeholder input onto the main graph's prediction output.
    for node in tmp_graph.nodes:
        node.name = "Addition_" + node.name  # avoid name clashes with the main graph
        gs_graph.nodes.append(node)
        if node.inputs and node.inputs[0].name == "tmpInput":
            node.inputs[0] = gs_graph.outputs[0]
    gs_graph.outputs = tmp_graph.outputs  # graph now ends at (boxes, scores)
    gs_graph.cleanup().toposort()

    # Register the EfficientNMS_TRT plugin node.
    op_inputs = gs_graph.outputs
    op = "EfficientNMS_TRT"
    attrs = {
        "plugin_version": "1",
        "background_class": -1,  # no background class
        "max_output_boxes": topk_all,
        "score_threshold": conf_thres,
        "iou_threshold": iou_thres,
        "score_activation": False,
        "box_coding": 0,}  # boxes arrive in corner (x1, y1, x2, y2) format
    # NMS outputs — shapes must track max_output_boxes. These were hard-coded
    # to 100, declaring wrong output shapes whenever topk_all != 100.
    batch = gs_graph.outputs[0].shape[0]
    output_num_detections = gs.Variable(name="num_detections", dtype=np.int32,
                                        shape=[batch, 1
                                               ])  # A scalar indicating the number of valid detections per batch image.
    output_boxes = gs.Variable(name="detection_boxes", dtype=np.float32, shape=[batch, topk_all, 4])
    output_scores = gs.Variable(name="detection_scores", dtype=np.float32, shape=[batch, topk_all])
    output_labels = gs.Variable(name="detection_classes", dtype=np.int32, shape=[batch, topk_all])
    op_outputs = [output_num_detections, output_boxes, output_scores, output_labels]
    gs_graph.layer(op=op, name="batched_nms", inputs=op_inputs, outputs=op_outputs, attrs=attrs)
    gs_graph.outputs = op_outputs
    gs_graph.cleanup().toposort()
    onnx.save(gs.export_onnx(gs_graph), str(file.with_suffix('.onnx')))
    return file