Support yolonas

Li-Hongda · Li-Hongda · commit 269ae9c4573c · 2023-06-07T20:58:59.000+08:00
diff --git a/README.md b/README.md
@@ -37,6 +37,7 @@ This repo use TensorRT-8.x to deploy well-trained models, both image preprocessi
 + 2023.05.19 🚀 Support cuda mask postprocess and support rtdetr.
 + 2023.05.21 🚀 Support yolov6.
 + 2023.05.26 🚀 Support dynamic batch inference.
++ 2023.06.07 🚀 Support yolox and yolo-nas.
 </details>
 
 ## 3.Support Models
@@ -51,6 +52,7 @@ This repo use TensorRT-8.x to deploy well-trained models, both image preprocessi
 - [x] [YOLOv8-seg](https://github.com/ultralytics/ultralytics)<br>
 - [x] [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX)<br>
 - [x] [RT-DETR](https://github.com/PaddlePaddle/PaddleDetection/tree/develop/configs/rtdetr)<br>
+- [x] [YOLO-NAS](https://github.com/Deci-AI/super-gradients)<br>
 </details>
 
 All speed tests were performed on RTX 3090 with COCO Val set.The time calculated here is the sum of the time of image loading, preprocess, inference and postprocess, so it's going to be slower than what's reported in the paper.
@@ -65,6 +67,8 @@ All speed tests were performed on RTX 3090 with COCO Val set.The time calculated
 | YOLOv7  | 1 | FP32 | 640x640 | 107 |
 | YOLOv8-s  | 1 | FP32 | 640x640 | 171 |
 | YOLOv8-seg-s  | 1 | FP32 | 640x640 | 122 |
+| YOLOX-s  | 1 | FP32 | 640x640 | 156 |
+| YOLO-NAS-s  | 1 | FP32 | 640x640 | 165 |
 | RT-DETR  | 1 | FP32 | 640x640 | 106 |
 </div>
 
diff --git a/configs/yolonas.yaml b/configs/yolonas.yaml
@@ -0,0 +1,15 @@
+yolonas:
+    onnx_file:     "../weights/yolonas/yolonas-s.onnx"
+    engine_file:   "../weights/yolonas/yolonas-s.trt"
+    type:          "coco"
+    mode:          "fp32"
+    dynamic:       0
+    batchSize:     1
+    imageWidth:    640
+    imageHeight:   640
+    conf_thr:      0.25
+    nms_thr:       0.45
+    strides:       [8, 16, 32]
+    imgScale:      255
+    imgMean:       [ 0, 0, 0 ]
+    imgStd:        [ 1, 1, 1 ]
diff --git a/include/build.h b/include/build.h
@@ -7,6 +7,7 @@
 #include "yolov7.h"
 #include "yolov8.h"
 #include "yolox.h"
+#include "yolonas.h"
 #include "rtdetr.h"
 
 std::shared_ptr<Model> build_model(std::string model_arch, std::string cfg);
diff --git a/include/cuda_function.h b/include/cuda_function.h
@@ -35,6 +35,9 @@ void yolov8_postprocess_box(float* predict, int num_bboxes, int num_classes, int
 void rtdetr_postprocess_box(float* predict_box, float* predict_cls, int num_bboxes,  int num_classes, int num_out,
 							float conf_thr, int imageWidth, int imageHeight, AffineMatrix mat, cudaStream_t stream, float* dst);
 
+void yolonas_postprocess_box(float* predict, int num_bboxes, int num_classes, int num_out, float conf_thr,
+				float nms_thr, AffineMatrix mat, cudaStream_t stream, float* dst);  // YOLO-NAS decode + NMS on `stream`; results are copied into host buffer `dst`
+
 void postprocess_box_mask(float* predict, int num_bboxes, int num_classes, int num_out, 
 						  float conf_thr, float nms_thr, AffineMatrix mat, cudaStream_t stream, float* dst);
 
diff --git a/include/yolonas.h b/include/yolonas.h
@@ -0,0 +1,12 @@
+#ifndef YOLONAS_H
+#define YOLONAS_H
+
+#include "yolov8.h"
+
+class YOLONAS : public YOLOv8 {  // YOLO-NAS detector; inherits from YOLOv8
+public:
+    explicit YOLONAS(const YAML::Node &config);  // built from the model's YAML config node
+    std::vector<Detections> PostProcess(const std::vector<cv::Mat> &imgBatch, float* output);  // returns one Detections entry per image in imgBatch
+};
+
+#endif
diff --git a/object_detection/CMakeLists.txt b/object_detection/CMakeLists.txt
@@ -66,11 +66,12 @@ add_subdirectory(yolov6)
 add_subdirectory(yolov7)
 add_subdirectory(yolov8)
 add_subdirectory(yolox)
+add_subdirectory(yolonas)
 add_subdirectory(rtdetr)
 set(LIBRARY_OUTPUT_PATH ../libs)
 add_library(build SHARED ../src/build.cpp)
 
 set(EXECUTABLE_OUTPUT_PATH ../../bin)
 add_executable(object_detection main.cpp)
-target_link_libraries(object_detection yaml-cpp build yolov5 yolov6 yolov7 yolov8 yolox rtdetr cudart)
+target_link_libraries(object_detection yaml-cpp build yolov5 yolov6 yolov7 yolov8 yolox yolonas rtdetr cudart)
 
diff --git a/object_detection/yolonas/CMakeLists.txt b/object_detection/yolonas/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 3.10)
+
+# Default to a Debug build, but respect a build type already chosen by the caller (e.g. -DCMAKE_BUILD_TYPE=Release).
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug")
+endif()
+project(yolonas)
+
+set(CMAKE_CXX_STANDARD 14)
+
+# YAML
+set(YAML_LIBRARY ../../yaml-cpp/build)
+set(YAML_INCLUDE ../../yaml-cpp/include)
+link_directories(${YAML_LIBRARY})
+list(APPEND ALL_INCLUDE ${YAML_INCLUDE})
+
+# Project
+set(PROJECT_INCLUDE  ../../include)
+message(STATUS "Find project include at ${PROJECT_INCLUDE}")
+list(APPEND ALL_INCLUDE ${PROJECT_INCLUDE})
+set(PROJECT_LIBRARY ../../src/common.cpp
+                    ../../src/basemodel.cpp
+                    ../../src/detection.cpp
+                    ../../src/yolo.cpp
+                    ../../src/yolonas.cpp
+                    ../../src/cuda_function.cu)
+
+include_directories(${ALL_INCLUDE})
+# SAMPLES_LIBRARY and ALL_LIBS are not set in this file; presumably inherited from the parent project - verify.
+set(LIBRARY_OUTPUT_PATH ../../libs)
+add_library(yolonas SHARED ${PROJECT_LIBRARY} ${SAMPLES_LIBRARY})
+target_link_libraries(yolonas ${ALL_LIBS} yaml-cpp)
diff --git a/object_detection/yolonas/export.py b/object_detection/yolonas/export.py
@@ -0,0 +1,78 @@
+import argparse
+import onnx
+import onnxsim
+import torch
+import torch.nn as nn
+from super_gradients.training import models
+from super_gradients.common.object_names import Models
+
+
+
+class YOLONAS(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.model.eval()
+
+    def forward(self, input):
+
+        output = self.model(input)
+        return torch.cat(output, dim=-1)
+    
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default='yolo_nas_m', 
+                        choices=['yolo_nas_s','yolo_nas_m', 'yolo_nas_l'] , 
+                        help='model.pt')
+    parser.add_argument('--save-model', type=str, default='yolonas-m.onnx', 
+                        help='model.onnx')
+    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], 
+                        help='image (h, w)')
+    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
+    parser.add_argument('--half', action='store_true', help='FP16 export')
+    parser.add_argument('--dynamic', action='store_true', help='dynamic axes')
+    parser.add_argument('--simplify', action='store_false', help='simplify model')
+    parser.add_argument('--opset', type=int, default=11, help='opset version')
+    args = parser.parse_args()
+    return args
+
+
+def main(model,
+         save_model, 
+         img_size,
+         batch_size,
+         opset = 11,
+         half = False,
+         dynamic = False,
+         simplify = True):
+    model = models.get(model, pretrained_weights="coco")
+    model.prep_model_for_conversion(input_size=[1, 3, 640, 640])
+
+    model = YOLONAS(model)
+    model.eval()
+    if dynamic:
+        dynamic = {'images': {0: 'batch', 2: 'height', 3: 'width'}, 
+                   'output0': {0: 'batch', 1: 'anchors'}}
+
+    img_size *= 2 if len(img_size) == 1 else 1
+    dummy_input = torch.zeros(batch_size, 3, *img_size)
+
+    torch.onnx.export(model, 
+                    dummy_input, 
+                    save_model, 
+                    input_names=['images'],
+                    output_names=['output0'],
+                    opset_version=opset, 
+                    do_constant_folding=True,
+                    dynamic_axes=dynamic or None)
+    model_onnx = onnx.load(save_model)
+    onnx.checker.check_model(model_onnx)
+    if simplify:
+        model_onnx, check = onnxsim.simplify(model_onnx)
+        assert check, 'simplify failed'
+        onnx.save(model_onnx, save_model)
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(**vars(args))
diff --git a/object_detection/yolonas/main.cpp b/object_detection/yolonas/main.cpp
@@ -0,0 +1,16 @@
+#include <iostream>
+#include "../../include/yolonas.h"
+
+int main(int argc, char** argv) {  // usage: <binary> <input path>; runs YOLO-NAS inference on that input
+    if (argc < 2) {  // argv[1] is mandatory; reading it unchecked is undefined behaviour
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "yolonas") << " <input path>" << std::endl;
+        return 1;
+    }
+    std::string inputpath = argv[1];
+    std::string savepath = "../results/yolonas/";  // relative to the working directory, like the config below
+    YAML::Node root = YAML::LoadFile("../configs/yolonas.yaml");
+    YOLONAS model(root["yolonas"]);  // named `model` so the variable no longer hides the class name
+    model.LoadEngine();
+    model.Inference(inputpath, savepath);
+    return 0;
+}
diff --git a/src/basemodel.cpp b/src/basemodel.cpp
@@ -18,7 +18,7 @@ Model::Model(const YAML::Node &config) {
 
 Model::~Model() {
     cudaStreamDestroy(stream);
-    for (int i = 0; i < engine->getNbBindings(); i++){
+    for (int i = 0; i < engine->getNbBindings(); i++) {
         CUDA_CHECK(cudaFree(gpu_buffers[i]));
     }
 };
@@ -91,7 +91,7 @@ bool Model::ReadTrtFile() {
     // sample::gLogInfo << "deserialize done" << std::endl;
 }
 
-void Model::LoadEngine(){
+void Model::LoadEngine() {
     // create and load engine
     std::fstream existEngine;
     existEngine.open(engine_file, std::ios::in);
diff --git a/src/build.cpp b/src/build.cpp
@@ -17,6 +17,8 @@ std::shared_ptr<Model> build_model(std::string model_arch, std::string cfg) {
         model = std::make_shared<YOLOv8_seg>(root[model_arch]); 
     else if (model_arch == "yolox")
         model = std::make_shared<YOLOX>(root[model_arch]);
+    else if (model_arch == "yolonas")
+        model = std::make_shared<YOLONAS>(root[model_arch]);        
     else if (model_arch == "rtdetr")
         model = std::make_shared<RTDETR>(root[model_arch]);
     else
diff --git a/src/cuda_function.cu b/src/cuda_function.cu
@@ -254,13 +254,56 @@ static __global__ void yolov8_decode_box_kernel(float* predict, int num_bboxes,
     float* pout_item = parray + 1 + index * num_out;
     pout_item[0] = left;
     pout_item[1] = top;
-    pout_item[2] = width;
-    pout_item[3] = height;
+    pout_item[2] = right - left;
+    pout_item[3] = bottom - top;
     pout_item[4] = score;
     pout_item[5] = label;
 	pout_item[6] = 1;		
 }
 
+// One thread per prediction row: arg-max the class scores, map the (l, t) and (r, b) points through
+// `mat`, then append [left, top, w, h, score, label, 1] to parray, whose element 0 is the counter.
+static __global__ void yolonas_decode_box_kernel(float* predict, int num_bboxes, int num_out,
+                                                 int num_classes, float conf_thr, AffineMatrix mat,
+                                                 float* parray) {
+    int position = blockDim.x * blockIdx.x + threadIdx.x;
+    if (position >= num_bboxes) return;
+
+    // Each row stores 4 box values followed by num_classes scores.
+    float* pred_per_obj = predict + position * (num_classes + 4);
+    float* cls_score = pred_per_obj + 4;
+    float score = cls_score[0];
+    int label = 0;
+    for (int i = 1; i < num_classes; i++) {
+        if (cls_score[i] > score) {
+            score = cls_score[i];
+            label = i;
+        }
+    }
+    if (score < conf_thr) return;
+
+    float l = pred_per_obj[0], t = pred_per_obj[1];
+    float r = pred_per_obj[2], b = pred_per_obj[3];
+    auto left = mat.v0 * l + mat.v1 * t + mat.v2;
+    auto right = mat.v0 * r + mat.v1 * b + mat.v2;
+    auto top = mat.v3 * l + mat.v4 * t + mat.v5;
+    auto bottom = mat.v3 * r + mat.v4 * b + mat.v5;
+
+    // Reserve an output slot. The host wrapper only runs NMS on and copies back 1000 records, so
+    // anything beyond that is dropped instead of being written past that region of parray.
+    int index = atomicAdd(parray, 1);
+    if (index >= 1000) return;
+    float* pout_item = parray + 1 + index * num_out;
+    pout_item[0] = left;
+    pout_item[1] = top;
+    pout_item[2] = right - left;
+    pout_item[3] = bottom - top;
+    pout_item[4] = score;
+    pout_item[5] = label;
+    pout_item[6] = 1;
+}
+
+
 
 static __global__ void rtdetr_decode_box_kernel(float* predict_box, float* predict_cls, int num_bboxes, 
                                                 int num_out, int num_classes, float conf_thr,
@@ -299,8 +342,8 @@ static __global__ void rtdetr_decode_box_kernel(float* predict_box, float* predi
     float* pout_item = parray + 1 + index * num_out;
     pout_item[0] = left;
     pout_item[1] = top;
-    pout_item[2] = width;
-    pout_item[3] = height;
+    pout_item[2] = right - left;
+    pout_item[3] = bottom - top;
     pout_item[4] = score;
     pout_item[5] = label;	
 }
@@ -346,8 +389,8 @@ static __global__ void decode_box_mask_kernel(float* predict, int num_bboxes, in
     float* pout_item = parray + 1 + index * num_out;
     pout_item[0] = left;
     pout_item[1] = top;
-    pout_item[2] = width;
-    pout_item[3] = height;
+    pout_item[2] = right - left;
+    pout_item[3] = bottom - top;
     pout_item[4] = score;
     pout_item[5] = label;
 	pout_item[6] = 1;	
@@ -395,8 +438,8 @@ static __global__ void yolov8_decode_box_mask_kernel(float* predict, int num_bbo
     float* pout_item = parray + 1 + index * num_out;
     pout_item[0] = left;
     pout_item[1] = top;
-    pout_item[2] = width;
-    pout_item[3] = height;
+    pout_item[2] = right - left;
+    pout_item[3] = bottom - top;
     pout_item[4] = score;
     pout_item[5] = label;
 	pout_item[6] = 1;	
@@ -486,6 +529,19 @@ void rtdetr_postprocess_box(float* predict_box, float* predict_cls, int num_bbox
 	CUDA_CHECK(cudaStreamSynchronize(stream));
 }
 
+// Decodes YOLO-NAS boxes, runs fast_nms_kernel, then copies the counter + 1000 records to `dst` and syncs `stream`.
+void yolonas_postprocess_box(float* predict, int num_bboxes, int num_classes, int num_out,
+	float conf_thr, float nms_thr, AffineMatrix mat, cudaStream_t stream, float* dst) {
+    const int block = 512;  // the decode kernel bounds-checks its index, so a fixed block size is safe
+    if (num_bboxes > 0)     // the old sizing divided by zero (block == 0) when num_bboxes was 0
+        yolonas_decode_box_kernel<<<(num_bboxes + block - 1) / block, block, 0, stream>>>(
+            predict, num_bboxes, num_out, num_classes, conf_thr, mat, out_buffer_device);
+    fast_nms_kernel<<<(1000 + block - 1) / block, block, 0, stream>>>(out_buffer_device, nms_thr, num_out);
+    CUDA_CHECK(cudaGetLastError());  // kernel launches do not report configuration errors themselves
+    CUDA_CHECK(cudaMemcpyAsync(dst, out_buffer_device, sizeof(int) + 1000 * num_out * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+}
+
 void postprocess_box_mask(float* predict, int num_bboxes, int num_classes, 
 	int num_out, float conf_thr, float nms_thr, AffineMatrix mat, cudaStream_t stream, float* dst) {
     auto block = num_bboxes > 512 ? 512 : num_bboxes;
diff --git a/src/yolonas.cpp b/src/yolonas.cpp
@@ -0,0 +1,37 @@
+#include "yolonas.h"
+
+// Forwards `config` to the YOLOv8 constructor, then recomputes num_bboxes as the sum over all
+// strides of (imageHeight / stride) * (imageWidth / stride).
+YOLONAS::YOLONAS(const YAML::Node &config) : YOLOv8(config) {
+    num_bboxes = 0;
+    for (const int s : strides) num_bboxes += int(imageHeight / s) * int(imageWidth / s);
+}
+
+// Runs the YOLO-NAS box postprocess for every image in the batch and gathers the kept boxes; clears dst2src at the end.
+std::vector<Detections> YOLONAS::PostProcess(const std::vector<cv::Mat> &imgBatch, float* output) {
+    std::vector<Detections> vec_result;
+    auto predSize = bufferSize[1] / batchSize / sizeof(float);  // floats per image; presumably bufferSize[1] is in bytes - confirm
+    for (int index = 0; index < (int)imgBatch.size(); ++index) {
+        Detections result;
+        cuda_postprocess_init(7, imageWidth, imageHeight);
+        yolonas_postprocess_box(output + index * predSize, num_bboxes, num_classes, 7, conf_thr, nms_thr,
+                                dst2src[index], stream, cpu_buffer);
+        int num_boxes = std::min((int)cpu_buffer[0], 1000);  // slot 0 holds the count; at most 1000 records are read
+        for (int i = 0; i < num_boxes; ++i) {
+            Box box;
+            float* rec = cpu_buffer + 1 + 7 * i;
+            if (rec[6] == 0) continue;  // element 6 is zero -> skipped (presumably cleared by NMS)
+            box.x = rec[0];
+            box.y = rec[1];
+            box.w = rec[2];
+            box.h = rec[3];
+            box.score = rec[4];
+            box.label = rec[5];
+            result.dets.emplace_back(box);
+        }
+        vec_result.emplace_back(result);
+    }
+
+    dst2src.clear();
+    return vec_result;
+}