feat(trt): add int8 inference
Bycob authored and mergify[bot] committed Jan 14, 2023
1 parent 5a02234 commit a212a8e
Showing 6 changed files with 297 additions and 19 deletions.
153 changes: 153 additions & 0 deletions src/backends/tensorrt/tensorrtcalibrator.hpp
@@ -0,0 +1,153 @@
/**
* DeepDetect
* Copyright (c) 2023 Jolibrain
* Author: Louis Jean <louis.jean@jolibrain.com>
*
* This file is part of deepdetect.
*
* deepdetect is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* deepdetect is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with deepdetect. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef TENSORRTCALIBRATOR_H
#define TENSORRTCALIBRATOR_H

#include <algorithm>
#include <fstream>
#include <iterator>
#include <memory>
#include <string>
#include <vector>

#include <cuda_runtime_api.h>

#include "mllibstrategy.h"
#include "NvInfer.h"

namespace dd
{
template <typename TConnector>
class TRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2
{
public:
TRTCalibrator(TConnector *connector, const std::string &model_repo,
int max_batch_size, bool use_cache,
std::shared_ptr<spdlog::logger> logger)
: _conn{ connector }, _logger(logger), _max_batch_size(max_batch_size),
_use_cache(use_cache),
_calibration_table_path(model_repo + "/calibration_table")
{
if (!use_cache)
{
// XXX(louis): works only for images
if (_conn->_bw)
_input_size = _conn->_height * _conn->_width;
else
_input_size = _conn->_height * _conn->_width * 3;

auto result = cudaMalloc(&_input_buf, _max_batch_size * _input_size
* sizeof(float));

if (result)
throw MLLibInternalException(
"Could not allocate input buffer for "
"model calibration (size="
+ std::to_string(_input_size * _max_batch_size) + ")");
#ifdef USE_CUDA_CV
_conn->_cuda_buf = static_cast<float *>(_input_buf);
#endif
}
}

virtual ~TRTCalibrator()
{
cudaFree(_input_buf);
}

int getBatchSize() const noexcept override
{
return _max_batch_size;
}

/** Returns next batch from input connector */
bool getBatch(void *bindings[], const char *names[],
int nbBindings) noexcept override
{
// only one binding
(void)names;
(void)nbBindings;

if (_use_cache)
return false;

int num_processed = _conn->process_batch(_max_batch_size);
if (num_processed == 0)
return false;
#ifdef USE_CUDA_CV
if (!_conn->_cuda)
#endif
{
// copy the whole preprocessed batch to the device buffer bound to the
// network input
cudaError_t err
= cudaMemcpyAsync(_input_buf, _conn->data(),
num_processed * _input_size * sizeof(float),
cudaMemcpyHostToDevice);
if (err != cudaSuccess)
return false;
}
bindings[0] = _input_buf;
return true;
}

/** read calibration table from disk */
const void *readCalibrationCache(size_t &length) noexcept override
{
if (!_use_cache)
return nullptr;

_calibration_cache.clear();
_logger->info("reading cache at {}", _calibration_table_path);
std::ifstream input(_calibration_table_path, std::ios::binary);
input >> std::noskipws;
if (input.good())
{
// TODO logger
std::copy(std::istream_iterator<char>(input),
std::istream_iterator<char>(),
std::back_inserter(_calibration_cache));
}
else
{
_logger->error(
"No int8 calibration data found, please run a calibration "
"inference with mllib.calibration = true");
}
length = _calibration_cache.size();
return _calibration_cache.data();
}

/** write calibration table to disk */
void writeCalibrationCache(const void *cache,
size_t length) noexcept override
{
std::ofstream output(_calibration_table_path, std::ios::binary);
output.write(reinterpret_cast<const char *>(cache), length);
}

private:
TConnector *_conn;
std::shared_ptr<spdlog::logger> _logger;

int _max_batch_size;
/// input size (not batched)
int _input_size;
void *_input_buf = nullptr;
bool _use_cache{ true };
std::string _calibration_table_path;
std::vector<char> _calibration_cache;
};
}

#endif
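
Note: the calibrator above only feeds batches and handles cache I/O; the per-tensor int8 scales are computed by TensorRT itself while the engine is built. A minimal sketch of how such a calibrator is attached to the builder, assuming a builder, network definition, input connector and logger already exist (names here are illustrative, not part of this commit):

// Sketch only: enable int8 and hand the calibrator to the builder config.
nvinfer1::IBuilderConfig *config = builder->createBuilderConfig();
config->setFlag(nvinfer1::BuilderFlag::kINT8);
auto calibrator = std::make_shared<dd::TRTCalibrator<TInputConnectorStrategy>>(
&inputc, model_repo, max_batch_size, /*use_cache=*/false, logger);
config->setInt8Calibrator(calibrator.get());
// While building, TensorRT calls getBatch() until it returns false, then
// persists the computed scales through writeCalibrationCache().
nvinfer1::IHostMemory *serialized
= builder->buildSerializedNetwork(*network, *config);

The calibrator must stay alive until the engine has been built and serialized, which is why the library below keeps it in a shared_ptr and only resets it once the engine has been written out.
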
2 changes: 1 addition & 1 deletion src/backends/tensorrt/tensorrtinputconns.h
@@ -28,6 +28,7 @@

namespace dd
{

class TensorRTInputInterface
{
public:
@@ -104,7 +105,6 @@ namespace dd
void GpuMatToRTBuffer(cv::cuda::GpuMat &img, int i);
#endif
};

}

#endif
70 changes: 63 additions & 7 deletions src/backends/tensorrt/tensorrtlib.cc
@@ -22,6 +22,7 @@
#include "tensorrtlib.h"
#include "utils/apitools.h"
#include "tensorrtinputconns.h"
#include "tensorrtcalibrator.hpp"
#include "NvInferPlugin.h"
#include "../parsers/onnx/NvOnnxParser.h"
#include "protoUtils.h"
@@ -39,8 +40,26 @@ namespace dd

static TRTLogger trtLogger;

static std::string dtype_to_str(nvinfer1::DataType dtype)
{
switch (dtype)
{
case nvinfer1::DataType::kFLOAT:
return "fp32";
case nvinfer1::DataType::kHALF:
return "fp16";
case nvinfer1::DataType::kINT32:
return "int32";
case nvinfer1::DataType::kINT8:
return "int8";
default:
throw MLLibInternalException("Unsupported datatype: "
+ std::to_string(int(dtype)));
}
}

static int findEngineBS(std::string repo, std::string engineFileName,
std::string arch)
std::string arch, nvinfer1::DataType dtype)
{
std::unordered_set<std::string> lfiles;
fileops::list_directory(repo, true, false, false, lfiles);
@@ -51,7 +70,8 @@ namespace dd
if (fstart == std::string::npos)
fstart = 0;

if (s.find(engineFileName + "_arch" + arch, fstart)
if (s.find(engineFileName + "_arch" + arch + "_" + dtype_to_str(dtype),
fstart)
!= std::string::npos)
{
std::string bs_str;
@@ -320,7 +340,7 @@

// remove compiled model files.
std::vector<std::string> extensions
= { "TRTengine", "net_tensorRT.proto" };
= { "TRTengine", "net_tensorRT.proto", "calibration_table" };
fileops::remove_directory_files(this->_mlmodel._repo, extensions);
}

@@ -385,6 +405,8 @@

_builderc->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE,
_max_workspace_size);
if (_calibrator)
_builderc->setInt8Calibrator(_calibrator.get());

network->getLayer(0)->setPrecision(nvinfer1::DataType::kFLOAT);

@@ -436,6 +458,10 @@
"Error while parsing onnx model for conversion to "
"TensorRT");
}

if (_calibrator)
_builderc->setInt8Calibrator(_calibrator.get());

// TODO check with onnx models dynamic shape
this->_logger->warn("Onnx model: max batch size not used");
_builderc->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE,
@@ -503,6 +529,7 @@

std::string out_blob = "prob";
std::string extract_layer = predict_dto->parameters->mllib->extract_layer;
bool calibration = predict_dto->parameters->mllib->calibration;

TInputConnectorStrategy inputc(this->_inputc);

@@ -571,13 +598,14 @@

bool engineRead = false;
std::string engine_path = this->_mlmodel._repo + "/" + _engineFileName
+ "_arch" + _arch + "_bs"
+ "_arch" + _arch + "_"
+ dtype_to_str(_datatype) + "_bs"
+ std::to_string(_max_batch_size);

if (_readEngine)
{
int bs
= findEngineBS(this->_mlmodel._repo, _engineFileName, _arch);
int bs = findEngineBS(this->_mlmodel._repo, _engineFileName, _arch,
_datatype);
if (bs != _max_batch_size && bs != -1)
{
throw MLLibBadParamException(
@@ -638,6 +666,31 @@
if (!engineRead)
{
nvinfer1::ICudaEngine *le = nullptr;
if (_datatype == nvinfer1::DataType::kINT8)
{
if (calibration)
{
try
{
inputc.transform(predict_dto);
}
catch (...)
{
throw;
}
}

bool calibrate_from_cache = !calibration;
if (calibrate_from_cache)
this->_logger->info(
"Setting up the int8 calibrator using cache");
else
this->_logger->info(
"Setting up the int8 calibrator using test data");
_calibrator.reset(new TRTCalibrator<TInputConnectorStrategy>(
&inputc, this->_mlmodel._repo, _max_batch_size,
calibrate_from_cache, this->_logger));
}

if (this->_mlmodel._model.find("net_tensorRT.proto")
!= std::string::npos
@@ -664,6 +717,9 @@
p.write(reinterpret_cast<const char *>(trtModelStream->data()),
trtModelStream->size());
}

// once the engine is built, calibrator is not needed anymore
_calibrator = nullptr;
}
else
{
@@ -782,7 +838,7 @@
TOutputConnectorStrategy tout(this->_outputc);
this->_stats.transform_start();
#ifdef USE_CUDA_CV
inputc._cuda_buf = static_cast<float *>(_buffers.data()[_inputIndex]);
inputc._cuda_buf = static_cast<float *>(_buffers.at(_inputIndex));
auto cv_stream = cv::cuda::StreamAccessor::wrapStream(cstream);
inputc._cuda_stream = &cv_stream;
#endif
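
With the naming change above, the serialized engine file now encodes the precision as well as the GPU architecture and batch size, so an engine built for another datatype is rebuilt rather than silently reused. A worked example with assumed values (_engineFileName = "TRTengine", _arch = "86" and a batch size of 4 are illustrative, not from this commit):

// Assumed values, for illustration only:
std::string engine_path = repo + "/TRTengine" + "_arch" + "86" + "_"
+ dtype_to_str(nvinfer1::DataType::kINT8) + "_bs" + std::to_string(4);
// -> "<repo>/TRTengine_arch86_int8_bs4"
// findEngineBS() matches the same "TRTengine_arch86_int8" prefix, so an
// engine previously built as e.g. "TRTengine_arch86_fp32_bs4" is no longer
// picked up when int8 inference is requested.
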
1 change: 1 addition & 0 deletions src/backends/tensorrt/tensorrtlib.h
@@ -127,6 +127,7 @@ namespace dd
_template; /**< template for models that require specific treatment */

//!< The TensorRT engine used to run the network
std::shared_ptr<nvinfer1::IInt8Calibrator> _calibrator = nullptr;
std::shared_ptr<nvinfer1::ICudaEngine> _engine = nullptr;
std::shared_ptr<nvinfer1::IBuilder> _builder = nullptr;
std::shared_ptr<nvinfer1::IExecutionContext> _context = nullptr;
9 changes: 9 additions & 0 deletions src/dto/mllib.hpp
@@ -192,6 +192,15 @@
}
DTO_FIELD(String, forward_method) = "";

// =====
// TensorRT Options
DTO_FIELD_INFO(calibration)
{
info->description
= "whether the data passed with this predict call should be used to "
"calibrate the int8 model";
};
DTO_FIELD(Boolean, calibration) = false;

// =====
// NCNN Options
DTO_FIELD_INFO(threads)
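
From the API side, the new flag is read from parameters.mllib of a predict call (see tensorrtlib.cc above): a first call with calibration set to true streams the supplied data through the calibrator and writes <model_repo>/calibration_table; subsequent int8 predict calls reuse that cached table. A hypothetical request body, in which the service name, input sizes and data path are assumptions for illustration only:

{
  "service": "my_trt_service",
  "parameters": {
    "input": { "width": 224, "height": 224 },
    "mllib": { "calibration": true },
    "output": {}
  },
  "data": ["/path/to/representative/images"]
}
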