Skip to content

Commit

Permalink
feat: full cuda pipeline for tensorrt
Browse files Browse the repository at this point in the history
  • Loading branch information
Bycob authored and mergify[bot] committed Oct 13, 2021
1 parent 523e528 commit 93815d7
Show file tree
Hide file tree
Showing 17 changed files with 604 additions and 207 deletions.
18 changes: 13 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -573,24 +573,30 @@ if (USE_TF)
endif() # USE_TF

# OpenCV
set(OPENCV_MODULES core imgproc highgui imgcodecs)
if (USE_CUDA_CV)
list(APPEND OPENCV_MODULES cudaimgproc cudaarithm cudawarping)
endif()

if (USE_OPENCV_VERSION STREQUAL "")
find_package(OpenCV 3 QUIET COMPONENTS core imgproc highgui imgcodecs)
find_package(OpenCV 3 QUIET COMPONENTS ${OPENCV_MODULES})
if (NOT OpenCV_FOUND)
find_package(OpenCV 4 QUIET COMPONENTS core imgproc highgui imgcodecs)
find_package(OpenCV 4 QUIET COMPONENTS ${OPENCV_MODULES})
endif()
if (NOT OpenCV_FOUND)
find_package(OpenCV 2 REQUIRED COMPONENTS core imgproc highgui imgcodecs)
find_package(OpenCV 2 REQUIRED COMPONENTS ${OPENCV_MODULES})
endif()
else()
find_package(OpenCV ${USE_OPENCV_VERSION} REQUIRED COMPONENTS core imgproc highgui imgcodecs)
find_package(OpenCV ${USE_OPENCV_VERSION} REQUIRED COMPONENTS ${OPENCV_MODULES})
endif()

set(OPENCV_VERSION ${OpenCV_VERSION_MAJOR})
include_directories(${OpenCV_INCLUDE_DIRS})
message(STATUS "OpenCV ${OPENCV_VERSION} (${OpenCV_VERSION}) found (${OpenCV_CONFIG_PATH})")

if (USE_CUDA_CV)
message(STATUS "Using CUDA OpenCV")
string(APPEND CMAKE_CXX_FLAGS " -DUSE_CUDA_CV")
add_definitions(-DUSE_CUDA_CV)
endif()

# customized Caffe as external project
Expand Down Expand Up @@ -1257,3 +1263,5 @@ message(STATUS "USE_XGBOOST: ${USE_XGBOOST}")
message(STATUS "USE_XGBOOST_CPU_ONLY: ${USE_XGBOOST_CPU_ONLY}")
message(STATUS "USE_TSNE: ${USE_TSNE}")
message(STATUS "USE_BOOST_BACKTRACE: ${USE_BOOST_BACKTRACE}")
message(STATUS "USE_CUDA_CV: ${USE_CUDA_CV}")
message(STATUS "OPENCV_VERSION: ${OPENCV_VERSION}")
2 changes: 1 addition & 1 deletion ci/Jenkinsfile-jetson-nano.unittests
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ ccache -s
script {
docker.image(env.DOCKER_IMAGE).inside(env.DOCKER_PARAMS) {
sh '''
cd build && ctest -V -E "multigpu"
cd build && ctest -V -E "multigpu|python"
'''
}
}
Expand Down
3 changes: 2 additions & 1 deletion ci/Jenkinsfile-trt.unittests
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ cmake .. \
-DBUILD_TESTS=ON \
-DBUILD_SPDLOG=ON \
-DUSE_HTTP_SERVER_OATPP=ON \
-DUSE_CUDA_CV=ON \
-DUSE_CAFFE=OFF \
-DUSE_TENSORRT=ON \
-DUSE_TENSORRT_OSS=ON \
Expand Down Expand Up @@ -68,7 +69,7 @@ ccache -s
python3 -c 'import torch, sys; c=torch.cuda.device_count() ; print(f"CUDA VISIBLE GPU: {c}"); sys.exit(bool(c == 0 ))'
echo
echo "****************************"
cd build && ctest -V -E "multigpu"
cd build && ctest -V -E "multigpu|python"
'''
}
}
Expand Down
2 changes: 1 addition & 1 deletion ci/devel-trt.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ RUN for url in \
; do curl -L -s -o /tmp/p.deb $url && dpkg -i /tmp/p.deb && rm -rf /tmp/p.deb; done

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install torch torchvision
RUN python3 -m pip install torch

# Build OpenCV 4 with CUDA
WORKDIR /tmp
Expand Down
8 changes: 8 additions & 0 deletions src/apidata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ namespace dd
return vout();
}

#ifdef USE_CUDA_CV
vout visitor_vad::operator()(const std::vector<cv::cuda::GpuMat> &vcv)
{
(void)vcv;
return vout();
}
#endif

vout visitor_vad::operator()(const std::vector<std::pair<int, int>> &vpi)
{
(void)vpi;
Expand Down
16 changes: 16 additions & 0 deletions src/apidata.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
#include <rapidjson/writer.h>
#pragma GCC diagnostic pop
#include <opencv2/core/core.hpp>
#ifdef USE_CUDA_CV
#include <opencv2/cudaimgproc.hpp>
#endif
#include "dd_types.h"
#include <unordered_map>
#include <vector>
Expand All @@ -52,6 +55,9 @@ namespace dd
std::string, double, int, long int, long long int, bool,
std::vector<std::string>, std::vector<double>, std::vector<int>,
std::vector<bool>, std::vector<cv::Mat>,
#ifdef USE_CUDA_CV
std::vector<cv::cuda::GpuMat>,
#endif
std::vector<std::pair<int, int>>,
mapbox::util::recursive_wrapper<APIData>,
mapbox::util::recursive_wrapper<std::vector<APIData>>, oatpp::Any>
Expand Down Expand Up @@ -122,6 +128,9 @@ namespace dd
vout operator()(const std::vector<bool> &vd);
vout operator()(const std::vector<std::string> &vs);
vout operator()(const std::vector<cv::Mat> &vcv);
#ifdef USE_CUDA_CV
vout operator()(const std::vector<cv::cuda::GpuMat> &vcv);
#endif
vout operator()(const std::vector<std::pair<int, int>> &vpi);
vout operator()(const APIData &ad);
vout operator()(const std::vector<APIData> &vad);
Expand Down Expand Up @@ -495,6 +504,13 @@ namespace dd
// automatically validated.
}

#ifdef USE_CUDA_CV
void operator()(const std::vector<cv::cuda::GpuMat> &vcv)
{
(void)vcv;
}
#endif

void operator()(const std::vector<std::pair<int, int>> &vpi)
{
(void)vpi;
Expand Down
84 changes: 79 additions & 5 deletions src/backends/tensorrt/tensorrtinputconns.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <boost/math/cstdfloat/cstdfloat_types.hpp>
#ifdef USE_CUDA_CV
#include <opencv2/core/cuda_stream_accessor.hpp>
#endif

namespace dd
{
Expand Down Expand Up @@ -84,6 +87,56 @@ namespace dd
}
}

#ifdef USE_CUDA_CV
void ImgTensorRTInputFileConn::GpuMatToRTBuffer(cv::cuda::GpuMat &img, int i)
{
cv::cuda::GpuMat converted;
int channels = img.channels();

if (_cuda_buf == nullptr)
throw InputConnectorInternalException(
"No cuda buffer available to copy the data");
if (_has_mean_scalar && _mean.size() != size_t(channels))
throw InputConnectorBadParamException(
"mean vector be of size the number of channels ("
+ std::to_string(channels) + ")");

if (!_std.empty() && _std.size() != size_t(channels))
throw InputConnectorBadParamException(
"std vector be of size the number of channels ("
+ std::to_string(channels) + ")");

bool has_std = !_std.empty();

// TODO use stream for asynchronous version
// TODO Maybe preallocation too?
img.convertTo(converted, CV_32F);

std::vector<cv::cuda::GpuMat> vec_channels;
cv::cuda::split(converted, vec_channels, *_cuda_stream);

for (int c = 0; c < channels; ++c)
{
auto &channel = vec_channels.at(c);
cv::cuda::multiply(channel, _scale, channel, 1, -1, *_cuda_stream);

if (_has_mean_scalar)
cv::cuda::add(channel, -_mean[c], channel, cv::noArray(), -1,
*_cuda_stream);
if (has_std)
cv::cuda::multiply(channel, 1.0 / _std[c], channel, 1, -1,
*_cuda_stream);

int offset = _height * _width * (i * channels + c);
cudaMemcpy2DAsync(_cuda_buf + offset, _width * sizeof(float),
channel.ptr<float>(), channel.step,
_width * sizeof(float), _height,
cudaMemcpyDeviceToDevice,
cv::cuda::StreamAccessor::getStream(*_cuda_stream));
}
}
#endif

void ImgTensorRTInputFileConn::transform(
oatpp::Object<DTO::ServicePredict> input_dto)
{
Expand All @@ -103,8 +156,13 @@ namespace dd
bool set_ids = false;
if (this->_ids.empty())
set_ids = true;
size_t img_count =
#ifdef USE_CUDA_CV
_cuda ? this->_cuda_images.size() :
#endif
this->_images.size();

for (int i = 0; i < (int)this->_images.size(); i++)
for (size_t i = 0; i < img_count; i++)
{
if (set_ids)
this->_ids.push_back(this->_uris.at(i));
Expand All @@ -125,13 +183,29 @@ namespace dd
_buf.resize(batch_size * height() * width());
else
_buf.resize(batch_size * 3 * height() * width());
for (i = 0; i < batch_size && _batch_index < (int)this->_images.size();

size_t img_count =
#ifdef USE_CUDA_CV
_cuda ? this->_cuda_images.size() :
#endif
this->_images.size();

for (i = 0; i < batch_size && _batch_index < (int)img_count;
i++, _batch_index++)
{
cv::Mat img = this->_images.at(_batch_index);
CVMatToRTBuffer(img, i);
#ifdef USE_CUDA_CV
if (_cuda)
{
cv::cuda::GpuMat img = this->_cuda_images.at(_batch_index);
GpuMatToRTBuffer(img, i);
}
else
#endif
{
cv::Mat img = this->_images.at(_batch_index);
CVMatToRTBuffer(img, i);
}
}
return i;
}

}
8 changes: 7 additions & 1 deletion src/backends/tensorrt/tensorrtinputconns.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ namespace dd

bool _has_mean_file = false; /**< image model mean.binaryproto. */
std::vector<float> _buf;
float *_cuda_buf = nullptr;

float *data()
{
Expand Down Expand Up @@ -87,16 +88,21 @@ namespace dd

void transform(oatpp::Object<DTO::ServicePredict> input_dto);

int process_batch(const unsigned int batch_size);

std::string _meanfname = "mean.binaryproto";
std::string _correspname = "corresp.txt";
int _batch_index = 0;
int _batch_size = 0;
int process_batch(const unsigned int batch_size);
std::unordered_map<std::string, std::pair<int, int>>
_imgs_size; /**< image sizes, used in detection. */

private:
void CVMatToRTBuffer(cv::Mat &img, int i);

#ifdef USE_CUDA_CV
void GpuMatToRTBuffer(cv::cuda::GpuMat &img, int i);
#endif
};

}
Expand Down
73 changes: 47 additions & 26 deletions src/backends/tensorrt/tensorrtlib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
#include <cuda_runtime_api.h>
#include <string>
#include "dto/service_predict.hpp"
#ifdef USE_CUDA_CV
#include <opencv2/core/cuda_stream_accessor.hpp>
#endif

namespace dd
{
Expand Down Expand Up @@ -701,8 +704,18 @@ namespace dd
}
}

cudaSetDevice(_gpuid);
cudaStream_t cstream;
cudaStreamCreate(&cstream);

TOutputConnectorStrategy tout(this->_outputc);
this->_stats.transform_start();
#ifdef USE_CUDA_CV
inputc._cuda_buf = static_cast<float *>(_buffers.data()[_inputIndex]);
auto cv_stream = cv::cuda::StreamAccessor::wrapStream(cstream);
inputc._cuda_stream = &cv_stream;
#endif

try
{
inputc.transform(predict_dto);
Expand Down Expand Up @@ -730,10 +743,6 @@ namespace dd
std::vector<APIData> vrad;
std::vector<UnsupervisedResult> unsup_results;

cudaSetDevice(_gpuid);
cudaStream_t cstream;
cudaStreamCreate(&cstream);

bool enqueue_success = false;
while (true)
{
Expand All @@ -744,16 +753,22 @@ namespace dd

try
{
if (inputc._bw)
cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
num_processed * inputc._height * inputc._width
* sizeof(float),
cudaMemcpyHostToDevice, cstream);
else
cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
num_processed * 3 * inputc._height
* inputc._width * sizeof(float),
cudaMemcpyHostToDevice, cstream);
#ifdef USE_CUDA_CV
if (!inputc._cuda)
#endif
{
if (inputc._bw)
cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
num_processed * inputc._height
* inputc._width * sizeof(float),
cudaMemcpyHostToDevice, cstream);
else
cudaMemcpyAsync(_buffers.data()[_inputIndex], inputc.data(),
num_processed * 3 * inputc._height
* inputc._width * sizeof(float),
cudaMemcpyHostToDevice, cstream);
}

if (!_explicit_batch)
enqueue_success = _context->enqueue(
num_processed, _buffers.data(), cstream, nullptr);
Expand Down Expand Up @@ -1036,20 +1051,26 @@ namespace dd
{
if (typeid(inputc) == typeid(ImgTensorRTInputFileConn))
{
auto *img_ic
= reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc);
APIData chain_input;
if (!reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
->_orig_images.empty())
chain_input.add(
"imgs", reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
->_orig_images);
#ifdef USE_CUDA_CV
if (!img_ic->_cuda_images.empty())
{
if (img_ic->_orig_images.empty())
chain_input.add("cuda_imgs", img_ic->_cuda_orig_images);
else
chain_input.add("cuda_imgs", img_ic->_cuda_images);
}
else
chain_input.add(
"imgs", reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
->_images);
chain_input.add(
"imgs_size",
reinterpret_cast<ImgTensorRTInputFileConn *>(&inputc)
->_images_size);
#endif
{
if (!img_ic->_orig_images.empty())
chain_input.add("imgs", img_ic->_orig_images);
else
chain_input.add("imgs", img_ic->_images);
}
chain_input.add("imgs_size", img_ic->_images_size);
out.add("input", chain_input);
}
}
Expand Down
Loading

0 comments on commit 93815d7

Please sign in to comment.