update readme

Li-Hongda · Li-Hongda · commit 7c83f2b0ff86 · 2023-05-20T17:27:07.000+08:00
diff --git a/README.md b/README.md
@@ -13,39 +13,48 @@
 </div>
 
 ## 1.Introduction
-This repo use TensorRT-8.x to deploy well-trained models.
-
+This repo uses TensorRT-8.x to deploy well-trained models. Both image preprocessing and postprocessing are performed with CUDA, which enables high-speed inference.
 ## 2.Update
+<details open>
+<summary>update process</summary>
 
-- [x] [YOLOv5](https://github.com/ultralytics/yolov5) (sd)
-- [x] [YOLOv5-seg](https://github.com/ultralytics/yolov5)
-- [x] [YOLOv7](https://github.com/WongKinYiu/yolov7)
-- [x] [YOLOv8](https://github.com/ultralytics/ultralytics)
-- [x] [YOLOv8-seg](https://github.com/ultralytics/ultralytics)
-
-
++ 2023.05.01 🚀 Create the repo.
++ 2023.05.03 🚀 Support yolov5 detection.
++ 2023.05.05 🚀 Support yolov7 and yolov5 instance-segmentation.
++ 2023.05.10 🚀 Support yolov8 detection and instance-segmentation.
++ 2023.05.12 🚀 Support cuda preprocess for speed up.
++ 2023.05.16 🚀 Support cuda box postprocess.
++ 2023.05.19 🚀 Support cuda mask postprocess and support rtdetr.
+</details>
 
 ## 3.Support Models
-All speed tests were performed on RTX 3090 with COCO Val set.The time calculated here is the sum of the time of image preprocess, inference and postprocess, since image loading and visualizing are not counted in, the actual spedd will be a little slower.
+<details open>
+<summary>supported models</summary>
+- [x] [YOLOv5](https://github.com/ultralytics/yolov5)<br>
+- [x] [YOLOv5-seg](https://github.com/ultralytics/yolov5)<br>
+- [x] [YOLOv7](https://github.com/WongKinYiu/yolov7)<br>
+- [x] [YOLOv8](https://github.com/ultralytics/ultralytics)<br>
+- [x] [YOLOv8-seg](https://github.com/ultralytics/ultralytics)<br>
+- [x] [RT-DETR](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/rtdetr)<br>
+- [ ] [YOLOv6](https://github.com/meituan/YOLOv6) (to be continued)<br>
+- [ ] [YOLO-NAS](https://github.com/Deci-AI/super-gradients) (to be continued)<br>
+</details>
+
+All speed tests were performed on an RTX 3090 with the COCO val set. The reported time is the sum of the time spent on image loading, preprocessing, inference and postprocessing, so it will be slower than what is reported in the paper.
+<div align='center'>
 
-| Models | BatchSize | Mode | Input Shape(HxW) | FPS* | FPS  |
-|-|-|:-:|:-:|:-:|:-:|
-| YOLOv5-n v7.0 | 1 | FP32 | 640x640 | 724 | 
+| Models | BatchSize | Mode | Input Shape(HxW) |  FPS  |
+|-|-|:-:|:-:|:-:|
 | YOLOv5-s v7.0  | 1 | FP32 | 640x640 | 468 |
 | YOLOv5-s v7.0  | 32 | FP32 | 640x640 | - |
-| YOLOv5-m v7.0  | 1 | FP32 | 640x640 | 270 |
-| YOLOv5-l v7.0  | 1 | FP32 | 640x640 | 151 |
-| YOLOv5-x v7.0  | 1 | FP32 | 640x640 | 94  |
+| YOLOv5-seg-s v7.0  | 1 | FP32 | 640x640 | - |
 | YOLOv7  | 1 | FP32 | 640x640 | 154 |
-| YOLOv7x  | 1 | FP32 | 640x640 | - | - |
-| YOLOv8-n  | 1 | FP32 | 640x640 | 390 | 127 |
-| YOLOv8-s  | 1 | FP32 | 640x640 | 171 | 101 |
-| YOLOv8-m  | 1 | FP32 | 640x640 | 122 |
-| YOLOv8-l  | 1 | FP32 | 640x640 | 88 |
-| YOLOv8-x  | 1 | FP32 | 640x640 | 68 |
-| RT-DETR  | 1 | FP32 | 640x640 | - | - |
-| RT-DETR  | 1 | FP32 | 640x640 | - | - |
-+ FPS* means that the time of image loading, image processing and visualization are taken into account when calculating.FPS only counts image processing time(preprocess, inference, postprocess).
+| YOLOv8-s  | 1 | FP32 | 640x640 | 171 |
+| YOLOv8-s  | 1 | FP32 | 640x640 | - |
+| RT-DETR  | 1 | FP32 | 640x640 | - |
+| RT-DETR  | 1 | FP32 | 640x640 | - |
+</div>
+
 
 ## 4.Usage
 1. Clone the repo.
@@ -65,4 +74,12 @@ cd bin
 ./object_detection yolov5 /path/to/input/dir 
 ```
 
+## 5.Reference
+[0].https://github.com/NVIDIA/TensorRT<br>
+[1].https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#c_topics<br>
+[2].https://github.com/linghu8812/tensorrt_inference<br>
+[3].https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#<br>
+[4].https://blog.csdn.net/bobchen1017?type=blog<br>
+
+
 
diff --git a/include/build.h b/include/build.h
@@ -2,13 +2,11 @@
 #define BUILD_H
 
 
-// #include "Swin-Transformer.h"
 #include "yolov5.h"
-// #include "YOLOv6.h"
+// #include "yolov6.h"
 #include "yolov7.h"
 #include "yolov8.h"
 #include "rtdetr.h"
 
 std::shared_ptr<Model> build_model(std::string model_arch, std::string cfg);
-// char **argv
 #endif
diff --git a/include/detection.h b/include/detection.h
@@ -17,32 +17,34 @@ namespace Category {
     };
     const std::vector<std::string> voc = {
         "aeroplane","bicycle","bird","boat","bottle","bus","car","cat","chair","cow","diningtable",
-        "dog","horse","motorbike","person","pottedplant","sheep","sofa","train","tvmonitor"
+        "dog","horse","motorbike","person","pottedplant","sheep","sofa","train","tv/monitor"
     };
 }
 
 namespace Color {
     const std::vector<cv::Scalar> coco { 
-        cv::Scalar(128, 77, 207),cv::Scalar(65, 32, 208),cv::Scalar(0, 224, 45),cv::Scalar(3, 141, 219),cv::Scalar(80, 239, 253),cv::Scalar(239, 184, 12),
-        cv::Scalar(7, 144, 145),cv::Scalar(161, 88, 57),cv::Scalar(0, 166, 46),cv::Scalar(218, 113, 53),cv::Scalar(193, 33, 128),cv::Scalar(190, 94, 113),
-        cv::Scalar(113, 123, 232),cv::Scalar(69, 205, 80),cv::Scalar(18, 170, 49),cv::Scalar(89, 51, 241),cv::Scalar(153, 191, 154),cv::Scalar(27, 26, 69),
-        cv::Scalar(20, 186, 194),cv::Scalar(210, 202, 167),cv::Scalar(196, 113, 204),cv::Scalar(9, 81, 88),cv::Scalar(191, 162, 67),cv::Scalar(227, 73, 120),
-        cv::Scalar(177, 31, 19),cv::Scalar(133, 102, 137),cv::Scalar(146, 72, 97),cv::Scalar(145, 243, 208),cv::Scalar(2, 184, 176),cv::Scalar(219, 220, 93),
-        cv::Scalar(238, 253, 234),cv::Scalar(197, 169, 160),cv::Scalar(204, 201, 106),cv::Scalar(13, 24, 129),cv::Scalar(40, 38, 4),cv::Scalar(5, 41, 34),
-        cv::Scalar(46, 94, 129),cv::Scalar(102, 65, 107),cv::Scalar(27, 11, 208),cv::Scalar(191, 240, 183),cv::Scalar(225, 76, 38),cv::Scalar(193, 89, 124),
-        cv::Scalar(30, 14, 175),cv::Scalar(144, 96, 90),cv::Scalar(181, 186, 86),cv::Scalar(102, 136, 34),cv::Scalar(158, 71, 15),cv::Scalar(183, 81, 247),
-        cv::Scalar(73, 69, 89),cv::Scalar(123, 73, 232),cv::Scalar(4, 175, 57),cv::Scalar(87, 108, 23),cv::Scalar(105, 204, 142),cv::Scalar(63, 115, 53),
-        cv::Scalar(105, 153, 126),cv::Scalar(247, 224, 137),cv::Scalar(136, 21, 188),cv::Scalar(122, 129, 78),cv::Scalar(145, 80, 81),cv::Scalar(51, 167, 149),
-        cv::Scalar(162, 173, 20),cv::Scalar(252, 202, 17),cv::Scalar(10, 40, 3),cv::Scalar(150, 90, 254),cv::Scalar(169, 21, 68),cv::Scalar(157, 148, 180),
-        cv::Scalar(131, 254, 90),cv::Scalar(7, 221, 102),cv::Scalar(19, 191, 184),cv::Scalar(98, 126, 199),cv::Scalar(210, 61, 56),cv::Scalar(252, 86, 59),
-        cv::Scalar(102, 195, 55),cv::Scalar(160, 26, 91),cv::Scalar(60, 94, 66),cv::Scalar(204, 169, 193),cv::Scalar(126, 4, 181),cv::Scalar(229, 209, 196),
-        cv::Scalar(195, 170, 186),cv::Scalar(155, 207, 148)
+        cv::Scalar(220, 20, 60), cv::Scalar(119, 11, 32), cv::Scalar(0, 0, 142), cv::Scalar(0, 0, 230), cv::Scalar(106, 0, 228), 
+        cv::Scalar(0, 60, 100), cv::Scalar(0, 80, 100), cv::Scalar(0, 0, 70), cv::Scalar(0, 0, 192), cv::Scalar(250, 170, 30), 
+        cv::Scalar(100, 170, 30), cv::Scalar(220, 220, 0), cv::Scalar(175, 116, 175), cv::Scalar(250, 0, 30), cv::Scalar(165, 42, 42), 
+        cv::Scalar(255, 77, 255), cv::Scalar(0, 226, 252), cv::Scalar(182, 182, 255), cv::Scalar(0, 82, 0), cv::Scalar(120, 166, 157), 
+        cv::Scalar(110, 76, 0), cv::Scalar(174, 57, 255), cv::Scalar(199, 100, 0), cv::Scalar(72, 0, 118), cv::Scalar(255, 179, 240), 
+        cv::Scalar(0, 125, 92), cv::Scalar(209, 0, 151), cv::Scalar(188, 208, 182), cv::Scalar(0, 220, 176), cv::Scalar(255, 99, 164), 
+        cv::Scalar(92, 0, 73), cv::Scalar(133, 129, 255), cv::Scalar(78, 180, 255), cv::Scalar(0, 228, 0), cv::Scalar(174, 255, 243), 
+        cv::Scalar(45, 89, 255), cv::Scalar(134, 134, 103), cv::Scalar(145, 148, 174), cv::Scalar(255, 208, 186), cv::Scalar(197, 226, 255), 
+        cv::Scalar(171, 134, 1), cv::Scalar(109, 63, 54), cv::Scalar(207, 138, 255), cv::Scalar(151, 0, 95), cv::Scalar(9, 80, 61), 
+        cv::Scalar(84, 105, 51), cv::Scalar(74, 65, 105), cv::Scalar(166, 196, 102), cv::Scalar(208, 195, 210), cv::Scalar(255, 109, 65), 
+        cv::Scalar(0, 143, 149), cv::Scalar(179, 0, 194), cv::Scalar(209, 99, 106), cv::Scalar(5, 121, 0), cv::Scalar(227, 255, 205), 
+        cv::Scalar(147, 186, 208), cv::Scalar(153, 69, 1), cv::Scalar(3, 95, 161), cv::Scalar(163, 255, 0), cv::Scalar(119, 0, 170), 
+        cv::Scalar(0, 182, 199), cv::Scalar(0, 165, 120), cv::Scalar(183, 130, 88), cv::Scalar(95, 32, 0), cv::Scalar(130, 114, 135), 
+        cv::Scalar(110, 129, 133), cv::Scalar(166, 74, 118), cv::Scalar(219, 142, 185), cv::Scalar(79, 210, 114), cv::Scalar(178, 90, 62), 
+        cv::Scalar(65, 70, 15), cv::Scalar(127, 167, 115), cv::Scalar(59, 105, 106), cv::Scalar(142, 108, 45), cv::Scalar(196, 172, 0), 
+        cv::Scalar(95, 54, 80), cv::Scalar(128, 76, 255), cv::Scalar(201, 57, 1), cv::Scalar(246, 0, 122), cv::Scalar(191, 162, 208)
     };
     const std::vector<cv::Scalar> voc {
-        cv::Scalar(128, 77, 207),cv::Scalar(65, 32, 208),cv::Scalar(0, 224, 45),cv::Scalar(3, 141, 219),cv::Scalar(80, 239, 253),cv::Scalar(239, 184, 12),
-        cv::Scalar(7, 144, 145),cv::Scalar(161, 88, 57),cv::Scalar(0, 166, 46),cv::Scalar(218, 113, 53),cv::Scalar(193, 33, 128),cv::Scalar(190, 94, 113),
-        cv::Scalar(113, 123, 232),cv::Scalar(69, 205, 80),cv::Scalar(18, 170, 49),cv::Scalar(89, 51, 241),cv::Scalar(153, 191, 154),cv::Scalar(27, 26, 69),
-        cv::Scalar(20, 186, 194),cv::Scalar(210, 202, 167),cv::Scalar(196, 113, 204),cv::Scalar(9, 81, 88),cv::Scalar(191, 162, 67),cv::Scalar(227, 73, 120)
+        cv::Scalar(106, 0, 228), cv::Scalar(119, 11, 32), cv::Scalar(165, 42, 42), cv::Scalar(0, 0, 192), cv::Scalar(197, 226, 255), 
+        cv::Scalar(0, 60, 100), cv::Scalar(0, 0, 142), cv::Scalar(255, 77, 255), cv::Scalar(153, 69, 1), cv::Scalar(120, 166, 157), 
+        cv::Scalar(0, 182, 199), cv::Scalar(0, 226, 252), cv::Scalar(182, 182, 255), cv::Scalar(0, 0, 230), cv::Scalar(220, 20, 60), 
+        cv::Scalar(163, 255, 0), cv::Scalar(0, 82, 0), cv::Scalar(3, 95, 161), cv::Scalar(0, 80, 100), cv::Scalar(183, 130, 88)
     };
 };
 
diff --git a/object_detection/CMakeLists.txt b/object_detection/CMakeLists.txt
@@ -61,9 +61,6 @@ list(APPEND ALL_INCLUDE ${PROJECT_INCLUDE})
 
 include_directories(${ALL_INCLUDE})
 
-
-# add_subdirectory(nanodet)
-# add_subdirectory(Swin-Transformer)
 add_subdirectory(yolov5)
 # add_subdirectory(yolov6)
 add_subdirectory(yolov7)
diff --git a/src/basemodel.cpp b/src/basemodel.cpp
@@ -116,13 +116,7 @@ void Model::PreProcess(std::vector<cv::Mat>& img_batch) {
         cv::Mat d2s = cv::Mat::zeros(2, 3, CV_32FC1);
         cv::invertAffineTransform(s2d, d2s);
 
-        // memcpy(d2s.value, dst2src.ptr<float>(0), sizeof(d2s.value));
-        dst2src.v0 = d2s.ptr<float>(0)[0];
-        dst2src.v1 = d2s.ptr<float>(0)[1];
-        dst2src.v2 = d2s.ptr<float>(0)[2];
-        dst2src.v3 = d2s.ptr<float>(1)[0];
-        dst2src.v4 = d2s.ptr<float>(1)[1];
-        dst2src.v5 = d2s.ptr<float>(1)[2]; 
+        memcpy(&dst2src, d2s.ptr(), sizeof(dst2src));
         preprocess(img_batch[i].ptr(), dst2src, width, height, &gpu_buffers[0][bufferSize[0] * i], imageWidth, imageHeight, stream); 
         CUDA_CHECK(cudaStreamSynchronize(stream));
     }
diff --git a/src/build.cpp b/src/build.cpp
@@ -7,7 +7,7 @@ std::shared_ptr<Model> build_model(std::string model_arch, std::string cfg) {
         model = std::make_shared<YOLOv5>(root[model_arch]);
     else if (model_arch == "yolov5-seg")
         model = std::make_shared<YOLOv5_seg>(root[model_arch]);
-    // else if (model_arch == "YOLOv6")
+    // else if (model_arch == "yolov6")
     //     model = std::make_shared<YOLOv6>(root[model_arch]);
     else if (model_arch == "yolov7")
         model = std::make_shared<YOLOv7>(root[model_arch]);
diff --git a/src/rtdetr.cpp b/src/rtdetr.cpp
@@ -21,9 +21,9 @@ std::vector<Detections> RTDETR::InferenceImages(std::vector<cv::Mat> &imgBatch)
     auto boxes = PostProcess(imgBatch, gpu_buffers[1], gpu_buffers[2]);
     auto t_end_post = std::chrono::high_resolution_clock::now();
     float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count();
-    std::cout << "preprocess time: "<< total_pre << "ms " <<
-    "detection inference time: " << total_inf << "ms " 
-    "postprocess time: " << total_post << "ms " << std::endl; 
+    // std::cout << "preprocess time: "<< total_pre << "ms " <<
+    // "detection inference time: " << total_inf << "ms " 
+    // "postprocess time: " << total_post << "ms " << std::endl; 
     return boxes;
 }
 
@@ -38,8 +38,12 @@ std::vector<Detections> RTDETR::PostProcess(const std::vector<cv::Mat> &imgBatch
         float* box_per_img = output1 + index * predboxSize;
         float* score_per_img = output2 + index * predscoreSize;
         cuda_postprocess_init(6, imageWidth, imageHeight);
+        auto t_start_post = std::chrono::high_resolution_clock::now();
         rtdetr_postprocess_box(box_per_img, score_per_img, num_bboxes, num_classes, 6, 
                                conf_thr, imageWidth, imageHeight, dst2src, stream, cpu_buffers[2]);
+        auto t_end_post = std::chrono::high_resolution_clock::now();
+        float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count(); 
+        std::cout << "postprocess time: " << total_post << "ms " << std::endl;
         int num_boxes = std::min((int)cpu_buffers[2][0], 300);
         for (int i = 0; i < num_boxes; i++) {
             Box box;