Support dynamic batch.

Li-Hongda · Li-Hongda · commit 55d8730a9fed · 2023-05-26T20:47:11.000+08:00
diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ This repo use TensorRT-8.x to deploy well-trained models, both image preprocessi
 + 2023.05.16 🚀 Support cuda box postprocess.
 + 2023.05.19 🚀 Support cuda mask postprocess and support rtdetr.
 + 2023.05.21 🚀 Support yolov6.
++ 2023.05.26 🚀 Support dynamic batch inference.
 </details>
 
 ## 3.Support Models
@@ -57,13 +58,13 @@ All speed tests were performed on RTX 3090 with COCO Val set.The time calculated
 | Models | BatchSize | Mode | Resolution |  FPS  |
 |-|-|:-:|:-:|:-:|
 | YOLOv5-s v7.0  | 1 | FP32 | 640x640 | 200 |
-| YOLOv5-s v7.0  | 32 | FP32 | 640x640 | - |
+| YOLOv5-s v7.0  | 32 | FP32 | 640x640 | 246 |
 | YOLOv5-seg-s v7.0  | 1 | FP32 | 640x640 | 155 |
 | YOLOv6-s v3  | 1 | FP32 | 640x640 | 163 |
 | YOLOv7  | 1 | FP32 | 640x640 | 107 |
 | YOLOv8-s  | 1 | FP32 | 640x640 | 171 |
 | YOLOv8-seg-s  | 1 | FP32 | 640x640 | 122 |
-| RT-DETR  | 1 | FP32 | 640x640 | - |
+| RT-DETR  | 1 | FP32 | 640x640 | 106 |
 </div>
 
 
@@ -97,8 +98,10 @@ mkdir build && cd build
 cmake ..
 make -j$(nproc)
 ```
-4. Download the TRT engine or ONNX model and put them in `weights/MODEL_NAME`. Then modify the configuration file in `configs`.
-
+4. Get the ONNX model from the official repository and put it in `weights/MODEL_NAME`. Then modify the configuration file in `configs`. Take yolov5 as an example:
+```
+python export.py --weights=yolov5s.pt  --dynamic --simplify --include=onnx --opset 11
+```
 5. The executable file will be generated in `bin` in the repo directory if compile successfully.Then enjoy yourself with command like this:
 ```
 cd bin
@@ -107,6 +110,7 @@ cd bin
 
 > Notes:
 > 1. The output of the model is required for post-processing is num_bboxes (imageHeight x image Width) x num_pred(num_cls + coordinates + confidence),while the output of YOLOv8 is num_pred x num_bboxes,which means the predicted values of the same box are not contiguous in memory.For convenience, the corresponding dimensions of the original pytorch output need to be transposed when exporting to ONNX model.
+> 2. The dynamic-shape engine is convenient but sacrifices some inference speed compared with a static engine of the same batch size. Therefore, if you want faster inference, it is better to export the ONNX model with a fixed batch size, such as 32.
 
 
 
diff --git a/configs/rtdetr.yaml b/configs/rtdetr.yaml
@@ -3,6 +3,7 @@ rtdetr:
     engine_file:   "../weights/rtdetr/rtdetr_hgnetl.trt"
     type:          "coco"
     mode:          "fp32"
+    dynamic:       1
     batchSize:     1
     imageWidth:    640
     imageHeight:   640
diff --git a/configs/yolov5-seg.yaml b/configs/yolov5-seg.yaml
@@ -3,6 +3,7 @@ yolov5-seg:
     engine_file:   "../weights/yolov5/yolov5s-seg.trt"
     type:          "coco"
     mode:          "fp32"
+    dynamic:       1
     batchSize:     1
     imageWidth:    640
     imageHeight:   640
diff --git a/configs/yolov5.yaml b/configs/yolov5.yaml
@@ -3,6 +3,7 @@ yolov5:
     engine_file:   "../weights/yolov5/yolov5s.trt"
     type:          "coco"
     mode:          "fp32"
+    dynamic:       1
     batchSize:     1
     imageWidth:    640
     imageHeight:   640
diff --git a/configs/yolov6.yaml b/configs/yolov6.yaml
@@ -3,6 +3,7 @@ yolov6:
     engine_file:   "../weights/yolov6/yolov6s.trt"
     type:          "coco"
     mode:          "fp32"
+    dynamic:       1
     batchSize:     1
     imageWidth:    640
     imageHeight:   640
diff --git a/configs/yolov7-p6.yaml b/configs/yolov7-p6.yaml
@@ -2,7 +2,8 @@ yolov7:
     onnx_file:     "../weights/yolov7/yolov7-w6.onnx"
     engine_file:   "../weights/yolov7/yolov7-w6.trt"
     type:          "coco"
-    mode:          "fp32"    
+    mode:          "fp32"
+    dynamic:       1  
     batchSize:     1
     imageWidth:    1280
     imageHeight:   1280
diff --git a/configs/yolov7.yaml b/configs/yolov7.yaml
@@ -3,6 +3,7 @@ yolov7:
     engine_file:   "../weights/yolov7/yolov7.trt"
     type:          "coco"
     mode:          "fp32"
+    dynamic:       1
     batchSize:     1
     imageWidth:    640
     imageHeight:   640
diff --git a/configs/yolov8-seg.yaml b/configs/yolov8-seg.yaml
@@ -3,6 +3,7 @@ yolov8-seg:
     engine_file:   "../weights/yolov8/yolov8s-seg.trt"
     type:          "coco"
     mode:          "fp32"
+    dynamic:       1
     batchSize:     1
     imageWidth:    640
     imageHeight:   640
diff --git a/configs/yolov8.yaml b/configs/yolov8.yaml
@@ -3,6 +3,7 @@ yolov8:
     engine_file:   "../weights/yolov8/yolov8s.trt"
     type:          "coco"
     mode:          "fp32"
+    dynamic:       1
     batchSize:     1
     imageWidth:    640
     imageHeight:   640
diff --git a/include/basemodel.h b/include/basemodel.h
@@ -19,14 +19,14 @@ class Model {
     std::string onnx_file;
     std::string engine_file;
     std::string mode;
-    std::vector<AffineMatrix> dst2src;
+    int dynamic;
     int batchSize;
     int imageWidth;
     int imageHeight;
-    std::string names[10];
-    float** cpu_buffers = new float* [10];
+    float* cpu_buffer;
     float* gpu_buffers[10]{};
     std::vector<int64_t> bufferSize;    
+    std::vector<AffineMatrix> dst2src;
     std::shared_ptr<nvinfer1::ICudaEngine> engine;
     std::unique_ptr<nvinfer1::IExecutionContext> context;
 
diff --git a/include/common.h b/include/common.h
@@ -20,11 +20,12 @@
 // cpp std
 #include<algorithm>
 #include<cstdlib>
-#include <cstring>
+#include<cstring>
 #include<math.h>
 #include<numeric>
 #include<fstream>
 #include<iostream>
+#include<iomanip>
 #include<sstream>
 #include<vector>
 #include<map>
diff --git a/src/basemodel.cpp b/src/basemodel.cpp
@@ -4,6 +4,7 @@ Model::Model(const YAML::Node &config) {
     onnx_file = config["onnx_file"].as<std::string>();
     engine_file = config["engine_file"].as<std::string>();
     mode = config["mode"].as<std::string>();
+    dynamic = config["dynamic"].as<int>();
     batchSize = config["batchSize"].as<int>();
     imageWidth = config["imageWidth"].as<int>();
     imageHeight = config["imageHeight"].as<int>();
@@ -20,12 +21,21 @@ Model::~Model() {
 
 void Model::OnnxToTRTModel() {
     // create the builder
-    nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger());
+    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger());
     assert(builder != nullptr);
 
     const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
     auto network = builder->createNetworkV2(explicitBatch);
     auto config = builder->createBuilderConfig();
+    if (dynamic) {
+        nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
+        profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1,3,imageWidth,imageHeight));
+        profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(8,3,imageWidth,imageHeight));
+        profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(32,3,imageWidth,imageHeight));
+
+        config->addOptimizationProfile(profile);
+    }
+
 
     auto parser = nvonnxparser::createParser(*network, sample::gLogger.getTRTLogger());
     if (!parser->parseFromFile(onnx_file.c_str(), static_cast<int>(sample::gLogger.getReportableSeverity()))) {
@@ -36,6 +46,7 @@ void Model::OnnxToTRTModel() {
     if (mode == "fp16")
         config->setFlag(nvinfer1::BuilderFlag::kFP16);
     else if  (mode == "int8")
+        // TODO: support int8 calibrate
         config->setFlag(nvinfer1::BuilderFlag::kINT8);
 
     nvinfer1::IHostMemory* data = builder->buildSerializedNetwork(*network, *config);
@@ -95,18 +106,24 @@ void Model::LoadEngine(){
     bufferSize.resize(nbBindings);
     for (int i = 0; i < nbBindings; ++i) {
         nvinfer1::Dims dims = engine->getBindingDimensions(i);
+        if (dims.d[0] == -1)
+                dims.d[0] = batchSize;
         nvinfer1::DataType dtype = engine->getBindingDataType(i);
-        names[i] = engine->getBindingName(i);
         int64_t totalSize = sample::volume(dims) * sample::dataTypeSize(dtype);
-        cpu_buffers[i] = (float* )malloc(totalSize);
         bufferSize[i] = totalSize;
         CUDA_CHECK(cudaMalloc(&gpu_buffers[i], totalSize));
     }
+    cpu_buffer = (float* )malloc(1000 * 100 * sizeof(float));
+    if (dynamic) {
+        this->context->setOptimizationProfile(0);
+        this->context->setBindingDimensions(0, nvinfer1::Dims4(batchSize, 3, imageHeight, imageWidth));        
+    }
     //get stream  
     cudaStreamCreate(&stream);
 }
 
 void Model::PreProcess(std::vector<cv::Mat>& img_batch) {
+    int size = imageWidth * imageHeight * 3;
     dst2src.reserve(batchSize);
     for (size_t i = 0; i < img_batch.size(); i++) {
         int height = img_batch[i].rows; 
@@ -119,7 +136,7 @@ void Model::PreProcess(std::vector<cv::Mat>& img_batch) {
         AffineMatrix mat;
         memcpy(&mat, d2s.ptr(), sizeof(mat));
         dst2src.emplace_back(mat);
-        preprocess(img_batch[i].ptr(), mat, width, height, &gpu_buffers[0][bufferSize[0] * i], imageWidth, imageHeight, stream); 
+        preprocess(img_batch[i].ptr(), mat, width, height, &gpu_buffers[0][size * i], imageWidth, imageHeight, stream); 
         CUDA_CHECK(cudaStreamSynchronize(stream));
     }
-}
+}
diff --git a/src/cuda_function.cu b/src/cuda_function.cu
@@ -35,7 +35,7 @@ void cuda_postprocess_destroy() {
 	CUDA_CHECK(cudaFreeHost(out_buffer_host));
 	CUDA_CHECK(cudaFree(out_buffer_device));
 	CUDA_CHECK(cudaFree(out_mask_buffer_device));
-	// CUDA_CHECK(cudaFree(single_out_buffer_device));	
+	CUDA_CHECK(cudaFree(single_out_buffer_device));	
 }
 
 
@@ -159,6 +159,7 @@ static __global__ void fast_nms_kernel(float* bboxes, float threshold, int num_o
             }
         }
     }
+    // printf("%d", bboxes[0]);
 }
 
 static __global__ void decode_box_kernel(float* predict, int num_bboxes, int num_out,
@@ -508,7 +509,7 @@ void yolov8_postprocess_box_mask(float* predict, int num_bboxes, int num_classes
 
 void process_mask(float* out, float* proto, uint8_t* dst , int num_out, 
                   int dst_width, int dst_height, int out_w, int proto_size, cudaStream_t stream) {
-	int threads = 256;
+	int threads = 512;
 	int blocks = ceil(proto_size / threads);
  
 	CUDA_CHECK(cudaMemcpyAsync(single_out_buffer_device, out, 
diff --git a/src/detection.cpp b/src/detection.cpp
@@ -96,7 +96,9 @@ void Detection::Inference(const std::string &input_path, const std::string &save
             imgInfo.clear(); 
         }
     }
-    delete [] cpu_buffers;
+    delete cpu_buffer;
+    cuda_preprocess_destroy();
+    cuda_postprocess_destroy();
     std::cout << "Average processing time is " << total_time / image_list.size() << "ms " << std::endl;
     std::cout << "Average FPS is " << 1000 * image_list.size() / total_time << std::endl;
 }
diff --git a/src/instance_segmentation.cpp b/src/instance_segmentation.cpp
@@ -77,7 +77,6 @@ void InstanceSegmentation::Inference(const std::string &input_path, const std::s
     imgInfo.reserve(batchSize);
     float total_time = 0;
     cuda_preprocess_init(maxImageSize);
-    // cuda_postprocess_init(39, imageWidth, imageHeight);
     for (const std::string &image_name : image_list) {
         index++;
         // TODO: figure out why double free.
diff --git a/src/rtdetr.cpp b/src/rtdetr.cpp
@@ -21,9 +21,10 @@ std::vector<Detections> RTDETR::InferenceImages(std::vector<cv::Mat> &imgBatch)
     auto boxes = PostProcess(imgBatch, gpu_buffers[1], gpu_buffers[2]);
     auto t_end_post = std::chrono::high_resolution_clock::now();
     float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count();
-    std::cout << "preprocess time: "<< total_pre << "ms " <<
-    "detection inference time: " << total_inf << "ms " 
-    "postprocess time: " << total_post << "ms " << std::endl; 
+    std::cout << std::fixed << std::setprecision(4) << 
+    "batch preprocess time: "<< total_pre << "ms " <<
+    "batch inference time: " << total_inf << "ms " 
+    "batch postprocess time: " << total_post << "ms " << std::endl; 
     return boxes;
 }
 
@@ -39,11 +40,11 @@ std::vector<Detections> RTDETR::PostProcess(const std::vector<cv::Mat> &imgBatch
         float* score_per_img = output2 + index * predscoreSize;
         cuda_postprocess_init(6, imageWidth, imageHeight);
         rtdetr_postprocess_box(box_per_img, score_per_img, num_bboxes, num_classes, 6, 
-                               conf_thr, imageWidth, imageHeight, dst2src[index], stream, cpu_buffers[2]);
-        int num_boxes = std::min((int)cpu_buffers[2][0], 300);
+                               conf_thr, imageWidth, imageHeight, dst2src[index], stream, cpu_buffer);
+        int num_boxes = std::min((int)cpu_buffer[0], 300);
         for (int i = 0; i < num_boxes; i++) {
             Box box;
-            float* ptr = cpu_buffers[2] + 1 + 6 * i;
+            float* ptr = cpu_buffer + 1 + 6 * i;
             box.x = ptr[0];
             box.y = ptr[1];
             box.w = ptr[2];
diff --git a/src/yolo.cpp b/src/yolo.cpp
@@ -9,7 +9,7 @@ YOLO::YOLO(const YAML::Node &config) : Detection(config) {
     {
         num_bboxes += int(imageHeight / stride) * int(imageWidth / stride) * 3;
         index+=1;
-    }         
+    } 
 }
 
 std::vector<Detections> YOLO::InferenceImages(std::vector<cv::Mat> &imgBatch) noexcept{
@@ -29,25 +29,26 @@ std::vector<Detections> YOLO::InferenceImages(std::vector<cv::Mat> &imgBatch) no
     auto boxes = PostProcess(imgBatch, gpu_buffers[1]);
     auto t_end_post = std::chrono::high_resolution_clock::now();
     float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count();
-    std::cout << "preprocess time: "<< total_pre << "ms " <<
-    "detection inference time: " << total_inf << "ms " 
-    "postprocess time: " << total_post << "ms " << std::endl; 
+    std::cout << std::fixed << std::setprecision(4) << 
+    "batch preprocess time: "<< total_pre << "ms " <<
+    "batch inference time: " << total_inf << "ms " 
+    "batch postprocess time: " << total_post << "ms " << std::endl;  
     return boxes;
 }
 
 std::vector<Detections> YOLO::PostProcess(const std::vector<cv::Mat> &imgBatch, float* output) {
     std::vector<Detections> vec_result;
     int index = 0;
-    auto predSize = bufferSize[1] / sizeof(float);
+    auto predSize = bufferSize[1] / batchSize / sizeof(float);
     for (const cv::Mat &img : imgBatch) {
         Detections result;
         float* pred_per_img = output + index * predSize;
         cuda_postprocess_init(7, imageWidth, imageHeight);
-        postprocess_box(pred_per_img, num_bboxes, num_classes, 7, conf_thr, nms_thr, dst2src[index], stream, cpu_buffers[1]);
-        int num_boxes = std::min((int)cpu_buffers[1][0], 1000);
+        postprocess_box(pred_per_img, num_bboxes, num_classes, 7, conf_thr, nms_thr, dst2src[index], stream, cpu_buffer);
+        int num_boxes = std::min((int)cpu_buffer[0], 1000);        
         for (int i = 0; i < num_boxes; i++) {
             Box box;
-            float* ptr = cpu_buffers[1] + 1 + 7 * i;
+            float* ptr = cpu_buffer + 1 + 7 * i;
             if (!ptr[6]) continue;
             box.x = ptr[0];
             box.y = ptr[1];
@@ -87,32 +88,32 @@ std::vector<Segmentations> YOLO_seg::InferenceImages(std::vector<cv::Mat> &imgBa
     this->context->enqueueV2(gpu_buf, stream, nullptr); 
     auto t_end = std::chrono::high_resolution_clock::now();
     float total_inf = std::chrono::duration<float, std::milli>(t_end - t_start).count();
-    // CUDA_CHECK(cudaMemcpyAsync(cpu_buffers[1], gpu_buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream));    
     auto t_start_post = std::chrono::high_resolution_clock::now();
     auto boxes = PostProcess(imgBatch, gpu_buffers[1], gpu_buffers[2]);
     auto t_end_post = std::chrono::high_resolution_clock::now();
     float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count();
-    std::cout << "preprocess time: "<< total_pre << "ms " <<
-    "detection inference time: " << total_inf << "ms " 
-    "postprocess time: " << total_post << "ms " << std::endl;
+    std::cout << std::fixed << std::setprecision(4) << 
+    "batch preprocess time: "<< total_pre << "ms " <<
+    "batch inference time: " << total_inf << "ms " 
+    "batch postprocess time: " << total_post << "ms " << std::endl; 
     return boxes;
 }
 
 std::vector<Segmentations> YOLO_seg::PostProcess(const std::vector<cv::Mat> &imgBatch, float* output1, float* output2) {
     std::vector<Segmentations> vec_result;
     int index = 0;
-    auto protoSize = bufferSize[1] / sizeof(float);
-    auto predSize = bufferSize[2] / sizeof(float);
-    cuda_postprocess_init(39, imageWidth, imageHeight);
+    auto protoSize = bufferSize[1] / batchSize / sizeof(float);
+    auto predSize = bufferSize[2] / batchSize / sizeof(float);
     for (const cv::Mat &img : imgBatch){
         Segmentations result;
         float* proto = output1 + index * protoSize;
         float* pred_per_img = output2 + index * predSize;
-        postprocess_box_mask(pred_per_img, num_bboxes, num_classes, 39, conf_thr, nms_thr, dst2src[index], stream, cpu_buffers[1]);
-        int num_boxes = std::min((int)cpu_buffers[1][0], 1000);
+        cuda_postprocess_init(39, imageWidth, imageHeight);
+        postprocess_box_mask(pred_per_img, num_bboxes, num_classes, 39, conf_thr, nms_thr, dst2src[index], stream, cpu_buffer);
+        int num_boxes = std::min((int)cpu_buffer[0], 1000);
         for (int i = 0; i < num_boxes; i++) {
             Instance ins;
-            float* ptr = cpu_buffers[1] + 1 + 39 * i;
+            float* ptr = cpu_buffer + 1 + 39 * i;
             if (!ptr[6]) continue;
             ins.x = ptr[0];
             ins.y = ptr[1];
diff --git a/src/yolov8.cpp b/src/yolov8.cpp