feat: full cuda pipeline for tensorrt

jolibrain · Oct 13, 2021 · 93815d7 · 93815d7
1 parent 523e528
commit 93815d7
Show file tree

Hide file tree

Showing 17 changed files with 604 additions and 207 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -573,24 +573,30 @@ if (USE_TF)
 endif() # USE_TF
 
 # OpenCV
+set(OPENCV_MODULES core imgproc highgui imgcodecs)
+if (USE_CUDA_CV)
+  list(APPEND OPENCV_MODULES cudaimgproc cudaarithm cudawarping)
+endif()
+
 if (USE_OPENCV_VERSION STREQUAL "")
-  find_package(OpenCV 3 QUIET COMPONENTS core imgproc highgui imgcodecs)
+  find_package(OpenCV 3 QUIET COMPONENTS ${OPENCV_MODULES})
   if (NOT OpenCV_FOUND)
-    find_package(OpenCV 4 QUIET COMPONENTS core imgproc highgui imgcodecs)
+    find_package(OpenCV 4 QUIET COMPONENTS ${OPENCV_MODULES})
   endif()
   if (NOT OpenCV_FOUND)
-    find_package(OpenCV 2 REQUIRED COMPONENTS core imgproc highgui imgcodecs)
+    find_package(OpenCV 2 REQUIRED COMPONENTS ${OPENCV_MODULES})
   endif()
 else()
-  find_package(OpenCV ${USE_OPENCV_VERSION} REQUIRED COMPONENTS core imgproc highgui imgcodecs)
+  find_package(OpenCV ${USE_OPENCV_VERSION} REQUIRED COMPONENTS ${OPENCV_MODULES})
 endif()
+
 set(OPENCV_VERSION ${OpenCV_VERSION_MAJOR})
 include_directories(${OpenCV_INCLUDE_DIRS})
 message(STATUS "OpenCV ${OPENCV_VERSION} (${OpenCV_VERSION}) found (${OpenCV_CONFIG_PATH})")
 
 if (USE_CUDA_CV)
   message(STATUS "Using CUDA OpenCV")
-  string(APPEND CMAKE_CXX_FLAGS " -DUSE_CUDA_CV")
+  add_definitions(-DUSE_CUDA_CV)
 endif()
 
 # customized Caffe as external project
@@ -1257,3 +1263,5 @@ message(STATUS "USE_XGBOOST:           ${USE_XGBOOST}")
 message(STATUS "USE_XGBOOST_CPU_ONLY:  ${USE_XGBOOST_CPU_ONLY}")
 message(STATUS "USE_TSNE:              ${USE_TSNE}")
 message(STATUS "USE_BOOST_BACKTRACE:   ${USE_BOOST_BACKTRACE}")
+message(STATUS "USE_CUDA_CV:           ${USE_CUDA_CV}")
+message(STATUS "OPENCV_VERSION:        ${OPENCV_VERSION}")
diff --git a/ci/Jenkinsfile-jetson-nano.unittests b/ci/Jenkinsfile-jetson-nano.unittests
@@ -64,7 +64,7 @@ ccache -s
         script {
           docker.image(env.DOCKER_IMAGE).inside(env.DOCKER_PARAMS) {
             sh '''
-            cd build && ctest -V -E "multigpu"
+            cd build && ctest -V -E "multigpu|python" 
             '''
           }
         }

diff --git a/ci/Jenkinsfile-trt.unittests b/ci/Jenkinsfile-trt.unittests
@@ -38,6 +38,7 @@ cmake .. \
     -DBUILD_TESTS=ON  \
     -DBUILD_SPDLOG=ON \
     -DUSE_HTTP_SERVER_OATPP=ON \
+    -DUSE_CUDA_CV=ON  \
     -DUSE_CAFFE=OFF  \
     -DUSE_TENSORRT=ON  \
     -DUSE_TENSORRT_OSS=ON  \
@@ -68,7 +69,7 @@ ccache -s
               python3 -c 'import torch, sys; c=torch.cuda.device_count() ; print(f"CUDA VISIBLE GPU: {c}"); sys.exit(bool(c == 0 ))'
               echo
               echo "****************************"
-              cd build && ctest -V -E "multigpu"
+              cd build && ctest -V -E "multigpu|python"
               '''
             }
           }

diff --git a/ci/devel-trt.Dockerfile b/ci/devel-trt.Dockerfile
@@ -112,7 +112,7 @@ RUN for url in \
         ; do curl -L -s -o /tmp/p.deb $url && dpkg -i /tmp/p.deb && rm -rf /tmp/p.deb; done
 
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install torch torchvision
+RUN python3 -m pip install torch
 
 # Build OpenCV 4 with CUDA
 WORKDIR /tmp

diff --git a/src/apidata.cc b/src/apidata.cc
@@ -95,6 +95,14 @@ namespace dd
     return vout();
   }
 
+#ifdef USE_CUDA_CV
+  vout visitor_vad::operator()(const std::vector<cv::cuda::GpuMat> &vcv)
+  {
+    (void)vcv;
+    return vout();
+  }
+#endif
+
   vout visitor_vad::operator()(const std::vector<std::pair<int, int>> &vpi)
   {
     (void)vpi;

diff --git a/src/apidata.h b/src/apidata.h
@@ -33,6 +33,9 @@
 #include <rapidjson/writer.h>
 #pragma GCC diagnostic pop
 #include <opencv2/core/core.hpp>
+#ifdef USE_CUDA_CV
+#include <opencv2/cudaimgproc.hpp>
+#endif
 #include "dd_types.h"
 #include <unordered_map>
 #include <vector>
@@ -52,6 +55,9 @@ namespace dd
       std::string, double, int, long int, long long int, bool,
       std::vector<std::string>, std::vector<double>, std::vector<int>,
       std::vector<bool>, std::vector<cv::Mat>,
+#ifdef USE_CUDA_CV
+      std::vector<cv::cuda::GpuMat>,
+#endif
       std::vector<std::pair<int, int>>,
       mapbox::util::recursive_wrapper<APIData>,
       mapbox::util::recursive_wrapper<std::vector<APIData>>, oatpp::Any>
@@ -122,6 +128,9 @@ namespace dd
     vout operator()(const std::vector<bool> &vd);
     vout operator()(const std::vector<std::string> &vs);
     vout operator()(const std::vector<cv::Mat> &vcv);
+#ifdef USE_CUDA_CV
+    vout operator()(const std::vector<cv::cuda::GpuMat> &vcv);
+#endif
     vout operator()(const std::vector<std::pair<int, int>> &vpi);
     vout operator()(const APIData &ad);
     vout operator()(const std::vector<APIData> &vad);
@@ -495,6 +504,13 @@ namespace dd
       // automatically validated.
     }
 
+#ifdef USE_CUDA_CV
+    void operator()(const std::vector<cv::cuda::GpuMat> &vcv)
+    {
+      (void)vcv;
+    }
+#endif
+
     void operator()(const std::vector<std::pair<int, int>> &vpi)
     {
       (void)vpi;

diff --git a/src/backends/tensorrt/tensorrtinputconns.cc b/src/backends/tensorrt/tensorrtinputconns.cc
@@ -23,6 +23,9 @@
 #include <opencv2/highgui/highgui.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
 #include <boost/math/cstdfloat/cstdfloat_types.hpp>
+#ifdef USE_CUDA_CV
+#include <opencv2/core/cuda_stream_accessor.hpp>
+#endif
 
 namespace dd
 {
@@ -84,6 +87,56 @@ namespace dd
       }
   }
 
+#ifdef USE_CUDA_CV
+  void ImgTensorRTInputFileConn::GpuMatToRTBuffer(cv::cuda::GpuMat &img, int i)
+  {
+    cv::cuda::GpuMat converted;
+    int channels = img.channels();
+
+    if (_cuda_buf == nullptr)
+      throw InputConnectorInternalException(
+          "No cuda buffer available to copy the data");
+    if (_has_mean_scalar && _mean.size() != size_t(channels))
+      throw InputConnectorBadParamException(
+          "mean vector be of size the number of channels ("
+          + std::to_string(channels) + ")");
+
+    if (!_std.empty() && _std.size() != size_t(channels))
+      throw InputConnectorBadParamException(
+          "std vector be of size the number of channels ("
+          + std::to_string(channels) + ")");
+
+    bool has_std = !_std.empty();
+
+    // TODO use stream for asynchronous version
+    // TODO Maybe preallocation too?
+    img.convertTo(converted, CV_32F);
+
+    std::vector<cv::cuda::GpuMat> vec_channels;
+    cv::cuda::split(converted, vec_channels, *_cuda_stream);
+
+    for (int c = 0; c < channels; ++c)
+      {
+        auto &channel = vec_channels.at(c);
+        cv::cuda::multiply(channel, _scale, channel, 1, -1, *_cuda_stream);
+
+        if (_has_mean_scalar)
+          cv::cuda::add(channel, -_mean[c], channel, cv::noArray(), -1,
+                        *_cuda_stream);
+        if (has_std)
+          cv::cuda::multiply(channel, 1.0 / _std[c], channel, 1, -1,
+                             *_cuda_stream);
+
+        int offset = _height * _width * (i * channels + c);
+        cudaMemcpy2DAsync(_cuda_buf + offset, _width * sizeof(float),
+                          channel.ptr<float>(), channel.step,
+                          _width * sizeof(float), _height,
+                          cudaMemcpyDeviceToDevice,
+                          cv::cuda::StreamAccessor::getStream(*_cuda_stream));
+      }
+  }
+#endif
+
   void ImgTensorRTInputFileConn::transform(
       oatpp::Object<DTO::ServicePredict> input_dto)
   {
@@ -103,8 +156,13 @@ namespace dd
     bool set_ids = false;
     if (this->_ids.empty())
       set_ids = true;
+    size_t img_count =
+#ifdef USE_CUDA_CV
+        _cuda ? this->_cuda_images.size() :
+#endif
+              this->_images.size();
 
-    for (int i = 0; i < (int)this->_images.size(); i++)
+    for (size_t i = 0; i < img_count; i++)
       {
         if (set_ids)
           this->_ids.push_back(this->_uris.at(i));
@@ -125,13 +183,29 @@ namespace dd
       _buf.resize(batch_size * height() * width());
     else
       _buf.resize(batch_size * 3 * height() * width());
-    for (i = 0; i < batch_size && _batch_index < (int)this->_images.size();
+
+    size_t img_count =
+#ifdef USE_CUDA_CV
+        _cuda ? this->_cuda_images.size() :
+#endif
+              this->_images.size();
+
+    for (i = 0; i < batch_size && _batch_index < (int)img_count;
          i++, _batch_index++)
       {
-        cv::Mat img = this->_images.at(_batch_index);
-        CVMatToRTBuffer(img, i);
+#ifdef USE_CUDA_CV
+        if (_cuda)
+          {
+            cv::cuda::GpuMat img = this->_cuda_images.at(_batch_index);
+            GpuMatToRTBuffer(img, i);
+          }
+        else
+#endif
+          {
+            cv::Mat img = this->_images.at(_batch_index);
+            CVMatToRTBuffer(img, i);
+          }
       }
     return i;
   }
-
 }
diff --git a/src/backends/tensorrt/tensorrtinputconns.h b/src/backends/tensorrt/tensorrtinputconns.h
@@ -44,6 +44,7 @@ namespace dd
 
     bool _has_mean_file = false; /**< image model mean.binaryproto. */
     std::vector<float> _buf;
+    float *_cuda_buf = nullptr;
 
     float *data()
     {
@@ -87,16 +88,21 @@ namespace dd
 
     void transform(oatpp::Object<DTO::ServicePredict> input_dto);
 
+    int process_batch(const unsigned int batch_size);
+
     std::string _meanfname = "mean.binaryproto";
     std::string _correspname = "corresp.txt";
     int _batch_index = 0;
     int _batch_size = 0;
-    int process_batch(const unsigned int batch_size);
     std::unordered_map<std::string, std::pair<int, int>>
         _imgs_size; /**< image sizes, used in detection. */
 
   private:
     void CVMatToRTBuffer(cv::Mat &img, int i);
+
+#ifdef USE_CUDA_CV
+    void GpuMatToRTBuffer(cv::cuda::GpuMat &img, int i);
+#endif
   };
 
 }

diff --git a/src/backends/tensorrt/tensorrtlib.cc b/src/backends/tensorrt/tensorrtlib.cc
@@ -29,6 +29,9 @@
 #include <cuda_runtime_api.h>
 #include <string>
 #include "dto/service_predict.hpp"
+#ifdef USE_CUDA_CV
+#include <opencv2/core/cuda_stream_accessor.hpp>
+#endif
 
 namespace dd
 {
@@ -701,8 +704,18 @@ namespace dd
           }
       }
 
+    cudaSetDevice(_gpuid);
+    cudaStream_t cstream;
+    cudaStreamCreate(&cstream);
+
     TOutputConnectorStrategy tout(this->_outputc);
     this->_stats.transform_start();
+#ifdef USE_CUDA_CV
+    inputc._cuda_buf = static_cast<float *>(_buffers.data()[_inputIndex]);
+    auto cv_stream = cv::cuda::StreamAccessor::wrapStream(cstream);
+    inputc._cuda_stream = &cv_stream;
+#endif
+
     try
       {
         inputc.transform(predict_dto);
@@ -730,10 +743,6 @@ namespace dd
     std::vector<APIData> vrad;
     std::vector<UnsupervisedResult> unsup_results;
 
-    cudaSetDevice(_gpuid);
-    cudaStream_t cstream;
-    cudaStreamCreate(&cstream);
-
     bool enqueue_success = false;
     while (true)
       {
@@ -744,16 +753,22 @@ namespace dd
 
         try
           {
-            if (inputc._bw)
-              cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
-                              num_processed * inputc._height * inputc._width
-                                  * sizeof(float),
-                              cudaMemcpyHostToDevice, cstream);
-            else
-              cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
-                              num_processed * 3 * inputc._height
-                                  * inputc._width * sizeof(float),
-                              cudaMemcpyHostToDevice, cstream);
+#ifdef USE_CUDA_CV
+            if (!inputc._cuda)
+#endif
+              {
+                if (inputc._bw)
+                  cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
+                                  num_processed * inputc._height
+                                      * inputc._width * sizeof(float),
+                                  cudaMemcpyHostToDevice, cstream);
+                else
+                  cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
+                                  num_processed * 3 * inputc._height
+                                      * inputc._width * sizeof(float),
+                                  cudaMemcpyHostToDevice, cstream);
+              }
+
             if (!_explicit_batch)
               enqueue_success = _context->enqueue(
                   num_processed, _buffers.data(), cstream, nullptr);
@@ -1036,20 +1051,26 @@ namespace dd
       {
         if (typeid(inputc) == typeid(ImgTensorRTInputFileConn))
           {
+            auto *img_ic
+                = reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc);
             APIData chain_input;
-            if (!reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
-                     ->_orig_images.empty())
-              chain_input.add(
-                  "imgs", reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
-                              ->_orig_images);
+#ifdef USE_CUDA_CV
+            if (!img_ic->_cuda_images.empty())
+              {
+                if (img_ic->_orig_images.empty())
+                  chain_input.add("cuda_imgs", img_ic->_cuda_orig_images);
+                else
+                  chain_input.add("cuda_imgs", img_ic->_cuda_images);
+              }
             else
-              chain_input.add(
-                  "imgs", reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
-                              ->_images);
-            chain_input.add(
-                "imgs_size",
-                reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
-                    ->_images_size);
+#endif
+              {
+                if (!img_ic->_orig_images.empty())
+                  chain_input.add("imgs", img_ic->_orig_images);
+                else
+                  chain_input.add("imgs", img_ic->_images);
+              }
+            chain_input.add("imgs_size", img_ic->_images_size);
             out.add("input", chain_input);
           }
       }