feat(trt): add int8 inference
Bycob authored and mergify[bot] committed Jan 14, 2023
1 parent 5a02234 commit a212a8e
Showing 6 changed files with 297 additions and 19 deletions.
153 changes: 153 additions & 0 deletions src/backends/tensorrt/tensorrtcalibrator.hpp
@@ -0,0 +1,153 @@
/**
* DeepDetect
* Copyright (c) 2023 Jolibrain
* Author: Louis Jean <louis.jean@jolibrain.com>
*
* This file is part of deepdetect.
*
* deepdetect is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* deepdetect is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with deepdetect. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef TENSORRTCALIBRATOR_H
#define TENSORRTCALIBRATOR_H

#include <algorithm>
#include <fstream>
#include <iterator>
#include <memory>
#include <string>
#include <vector>

#include <cuda_runtime_api.h>

#include "mllibstrategy.h"
#include "NvInfer.h"

namespace dd
{
template <typename TConnector>
class TRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2
{
public:
TRTCalibrator(TConnector *connector, const std::string &model_repo,
int max_batch_size, bool use_cache,
std::shared_ptr<spdlog::logger> logger)
: _conn{ connector }, _logger(logger), _max_batch_size(max_batch_size),
_use_cache(use_cache),
_calibration_table_path(model_repo + "/calibration_table")
{
if (!use_cache)
{
// XXX(louis): works only for images
if (_conn->_bw)
_input_size = _conn->_height * _conn->_width;
else
_input_size = _conn->_height * _conn->_width * 3;

auto result = cudaMalloc(&_input_buf, _max_batch_size * _input_size
* sizeof(float));

if (result)
throw MLLibInternalException(
"Could not allocate input buffer for "
"model calibration (size="
+ std::to_string(_input_size * _max_batch_size) + ")");
#ifdef USE_CUDA_CV
_conn->_cuda_buf = static_cast<float *>(_input_buf);
#endif
}
}

virtual ~TRTCalibrator()
{
cudaFree(_input_buf);
}

int getBatchSize() const noexcept override
{
return _max_batch_size;
}

/** Returns next batch from input connector */
bool getBatch(void *bindings[], const char *names[],
int nbBindings) noexcept override
{
// only one binding
(void)names;
(void)nbBindings;

if (_use_cache)
return false;

int num_processed = _conn->process_batch(_max_batch_size);
if (num_processed == 0)
return false;
#ifdef USE_CUDA_CV
if (!_conn->_cuda)
#endif
{
// copy the whole preprocessed batch to the device buffer bound to the
// network input
cudaError_t err
= cudaMemcpyAsync(_input_buf, _conn->data(),
num_processed * _input_size * sizeof(float),
cudaMemcpyHostToDevice);
if (err != cudaSuccess)
return false;
}
bindings[0] = _input_buf;
return true;
}

/** read calibration table from disk */
const void *readCalibrationCache(size_t &length) noexcept override
{
if (!_use_cache)
return nullptr;

_calibration_cache.clear();
_logger->info("reading cache at {}", _calibration_table_path);
std::ifstream input(_calibration_table_path, std::ios::binary);
input >> std::noskipws;
if (input.good())
{
// TODO logger
std::copy(std::istream_iterator<char>(input),
std::istream_iterator<char>(),
std::back_inserter(_calibration_cache));
}
else
{
_logger->error(
"No int8 calibration data found, please run a calibration "
"inference with mllib.calibration = true");
}
length = _calibration_cache.size();
return _calibration_cache.data();
}

/** write calibration table to disk */
void writeCalibrationCache(const void *cache,
size_t length) noexcept override
{
std::ofstream output(_calibration_table_path, std::ios::binary);
output.write(reinterpret_cast<const char *>(cache), length);
}

private:
TConnector *_conn;
std::shared_ptr<spdlog::logger> _logger;

int _max_batch_size;
/// input size (not batched)
int _input_size;
void *_input_buf = nullptr;
bool _use_cache{ true };
std::string _calibration_table_path;
std::vector<char> _calibration_cache;
};
}

#endif
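
Note: the calibrator above only feeds batches and handles cache I/O; the per-tensor int8 scales are computed by TensorRT itself while the engine is built. A minimal sketch of how such a calibrator is attached to the builder, assuming a builder, network definition, input connector and logger already exist (names here are illustrative, not part of this commit):

// Sketch only: enable int8 and hand the calibrator to the builder config.
nvinfer1::IBuilderConfig *config = builder->createBuilderConfig();
config->setFlag(nvinfer1::BuilderFlag::kINT8);
auto calibrator = std::make_shared<dd::TRTCalibrator<TInputConnectorStrategy>>(
&inputc, model_repo, max_batch_size, /*use_cache=*/false, logger);
config->setInt8Calibrator(calibrator.get());
// While building, TensorRT calls getBatch() until it returns false, then
// persists the computed scales through writeCalibrationCache().
nvinfer1::IHostMemory *serialized
= builder->buildSerializedNetwork(*network, *config);

The calibrator must stay alive until the engine has been built and serialized, which is why the library below keeps it in a shared_ptr and only resets it once the engine has been written out.
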
2 changes: 1 addition & 1 deletion src/backends/tensorrt/tensorrtinputconns.h
@@ -28,6 +28,7 @@

namespace dd
{

class TensorRTInputInterface
{
public:
@@ -104,7 +105,6 @@ namespace dd
void GpuMatToRTBuffer(cv::cuda::GpuMat &img, int i);
#endif
};

}

#endif
70 changes: 63 additions & 7 deletions src/backends/tensorrt/tensorrtlib.cc
@@ -22,6 +22,7 @@
#include "tensorrtlib.h"
#include "utils/apitools.h"
#include "tensorrtinputconns.h"
#include "tensorrtcalibrator.hpp"
#include "NvInferPlugin.h"
#include "../parsers/onnx/NvOnnxParser.h"
#include "protoUtils.h"
@@ -39,8 +40,26 @@ namespace dd

static TRTLogger trtLogger;

static std::string dtype_to_str(nvinfer1::DataType dtype)
{
switch (dtype)
{
case nvinfer1::DataType::kFLOAT:
return "fp32";
case nvinfer1::DataType::kHALF:
return "fp16";
case nvinfer1::DataType::kINT32:
return "int32";
case nvinfer1::DataType::kINT8:
return "int8";
default:
throw MLLibInternalException("Unsupported datatype: "
+ std::to_string(int(dtype)));
}
}

static int findEngineBS(std::string repo, std::string engineFileName,
std::string arch)
std::string arch, nvinfer1::DataType dtype)
{
std::unordered_set<std::string> lfiles;
fileops::list_directory(repo, true, false, false, lfiles);
@@ -51,7 +70,8 @@ namespace dd
if (fstart == std::string::npos)
fstart = 0;

if (s.find(engineFileName + "_arch" + arch, fstart)
if (s.find(engineFileName + "_arch" + arch + "_" + dtype_to_str(dtype),
fstart)
!= std::string::npos)
{
std::string bs_str;
@@ -320,7 +340,7 @@

// remove compiled model files.
std::vector<std::string> extensions
= { "TRTengine", "net_tensorRT.proto" };
= { "TRTengine", "net_tensorRT.proto", "calibration_table" };
fileops::remove_directory_files(this->_mlmodel._repo, extensions);
}

@@ -385,6 +405,8 @@

_builderc->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE,
_max_workspace_size);
if (_calibrator)
_builderc->setInt8Calibrator(_calibrator.get());

network->getLayer(0)->setPrecision(nvinfer1::DataType::kFLOAT);

@@ -436,6 +458,10 @@
"Error while parsing onnx model for conversion to "
"TensorRT");
}

if (_calibrator)
_builderc->setInt8Calibrator(_calibrator.get());

// TODO check with onnx models dynamic shape
this->_logger->warn("Onnx model: max batch size not used");
_builderc->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE,
@@ -503,6 +529,7 @@

std::string out_blob = "prob";
std::string extract_layer = predict_dto->parameters->mllib->extract_layer;
bool calibration = predict_dto->parameters->mllib->calibration;

TInputConnectorStrategy inputc(this->_inputc);

@@ -571,13 +598,14 @@

bool engineRead = false;
std::string engine_path = this->_mlmodel._repo + "/" + _engineFileName
+ "_arch" + _arch + "_bs"
+ "_arch" + _arch + "_"
+ dtype_to_str(_datatype) + "_bs"
+ std::to_string(_max_batch_size);

if (_readEngine)
{
int bs
= findEngineBS(this->_mlmodel._repo, _engineFileName, _arch);
int bs = findEngineBS(this->_mlmodel._repo, _engineFileName, _arch,
_datatype);
if (bs != _max_batch_size && bs != -1)
{
throw MLLibBadParamException(
@@ -638,6 +666,31 @@
if (!engineRead)
{
nvinfer1::ICudaEngine *le = nullptr;
if (_datatype == nvinfer1::DataType::kINT8)
{
if (calibration)
{
try
{
inputc.transform(predict_dto);
}
catch (...)
{
throw;
}
}

bool calibrate_from_cache = !calibration;
if (calibrate_from_cache)
this->_logger->info(
"Setting up the int8 calibrator using cache");
else
this->_logger->info(
"Setting up the int8 calibrator using test data");
_calibrator.reset(new TRTCalibrator<TInputConnectorStrategy>(
&inputc, this->_mlmodel._repo, _max_batch_size,
calibrate_from_cache, this->_logger));
}

if (this->_mlmodel._model.find("net_tensorRT.proto")
!= std::string::npos
@@ -664,6 +717,9 @@
p.write(reinterpret_cast<const char *>(trtModelStream->data()),
trtModelStream->size());
}

// once the engine is built, calibrator is not needed anymore
_calibrator = nullptr;
}
else
{
@@ -782,7 +838,7 @@
TOutputConnectorStrategy tout(this->_outputc);
this->_stats.transform_start();
#ifdef USE_CUDA_CV
inputc._cuda_buf = static_cast<float *>(_buffers.data()[_inputIndex]);
inputc._cuda_buf = static_cast<float *>(_buffers.at(_inputIndex));
auto cv_stream = cv::cuda::StreamAccessor::wrapStream(cstream);
inputc._cuda_stream = &cv_stream;
#endif
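
With the naming change above, the serialized engine file now encodes the precision as well as the GPU architecture and batch size, so an engine built for another datatype is rebuilt rather than silently reused. A worked example with assumed values (_engineFileName = "TRTengine", _arch = "86" and a batch size of 4 are illustrative, not from this commit):

// Assumed values, for illustration only:
std::string engine_path = repo + "/TRTengine" + "_arch" + "86" + "_"
+ dtype_to_str(nvinfer1::DataType::kINT8) + "_bs" + std::to_string(4);
// -> "<repo>/TRTengine_arch86_int8_bs4"
// findEngineBS() matches the same "TRTengine_arch86_int8" prefix, so an
// engine previously built as e.g. "TRTengine_arch86_fp32_bs4" is no longer
// picked up when int8 inference is requested.
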
1 change: 1 addition & 0 deletions src/backends/tensorrt/tensorrtlib.h
@@ -127,6 +127,7 @@ namespace dd
_template; /**< template for models that require specific treatment */

//!< The TensorRT engine used to run the network
std::shared_ptr<nvinfer1::IInt8Calibrator> _calibrator = nullptr;
std::shared_ptr<nvinfer1::ICudaEngine> _engine = nullptr;
std::shared_ptr<nvinfer1::IBuilder> _builder = nullptr;
std::shared_ptr<nvinfer1::IExecutionContext> _context = nullptr;
9 changes: 9 additions & 0 deletions src/dto/mllib.hpp
@@ -192,6 +192,15 @@
}
DTO_FIELD(String, forward_method) = "";

// =====
// TensorRT Options
DTO_FIELD_INFO(calibration)
{
info->description
= "whether the data passed with this predict call should be used to "
"calibrate the int8 model";
};
DTO_FIELD(Boolean, calibration) = false;

// =====
// NCNN Options
DTO_FIELD_INFO(threads)
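
From the API side, the new flag is read from parameters.mllib of a predict call (see tensorrtlib.cc above): a first call with calibration set to true streams the supplied data through the calibrator and writes <model_repo>/calibration_table; subsequent int8 predict calls reuse that cached table. A hypothetical request body, in which the service name, input sizes and data path are assumptions for illustration only:

{
  "service": "my_trt_service",
  "parameters": {
    "input": { "width": 224, "height": 224 },
    "mllib": { "calibration": true },
    "output": {}
  },
  "data": ["/path/to/representative/images"]
}
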