set default FP16 and update GELAN

wang-xinyu · Apr 23, 2024 · fdd1fc2 · fdd1fc2
1 parent d033a63
commit fdd1fc2
Show file tree

Hide file tree

Showing 7 changed files with 387 additions and 11 deletions.
diff --git a/yolov9/README.md b/yolov9/README.md
@@ -8,13 +8,21 @@ The Pytorch implementation is [WongKinYiu/yolov9](https://github.com/WongKinYiu/
 
 ## Progress
 - [x] YOLOv9-c:
-    - [x] FP32
-    - [x] FP16
-    - [x] INT8
+  - [x] FP32
+  - [x] FP16
+  - [x] INT8
 - [x] YOLOv9-e:
-    - [x] FP32
-    - [x] FP16
-    - [x] INT8
+  - [x] FP32
+  - [x] FP16
+  - [x] INT8
+- [x] GELAN-c:
+  - [x] FP32
+  - [x] FP16
+  - [x] INT8
+- [x] GELAN-e:
+  - [x] FP32
+  - [x] FP16
+  - [x] INT8
 
 ## Requirements
 
@@ -32,7 +40,10 @@ The speed test is done on a desktop with R7-5700G CPU and RTX 4060Ti GPU. The in
 | tensorrt | YOLOv9-c | 13.5ms | 4.6ms | 3.0ms |
 | tensorrt | YOLOv9-e | 8.3ms | 3.2ms | 2.15ms |
 
+**Speed test results for GELAN will be added later.**
+
 YOLOv9-e is faster than YOLOv9-c in TensorRT because YOLOv9-e requires fewer layers at inference time.
+
 ```
 YOLOv9-c:
 [[31, 34, 37, 16, 19, 22], 1, DualDDetect, [nc]] # [A3, A4, A5, P3, P4, P5]

diff --git a/yolov9/demo.cpp b/yolov9/demo.cpp
@@ -25,7 +25,12 @@ void serialize_engine(unsigned int max_batchsize, std::string& wts_name, std::st
         serialized_engine = build_engine_yolov9_e(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
     } else if (sub_type == "c") {
         serialized_engine = build_engine_yolov9_c(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
-    } else {
+    } else if (sub_type == "ge") {
+        serialized_engine = build_engine_gelan_e(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
+    } else if (sub_type == "gc") {
+        serialized_engine = build_engine_gelan_c(max_batchsize, builder, config, DataType::kFLOAT, wts_name);
+    }
+    else {
         return;
     }
     assert(serialized_engine != nullptr);
@@ -113,15 +118,15 @@ int main(int argc, char** argv) {
 
     std::string wts_name = "";
     std::string engine_name = "";
-    std::string img_dir;
+    std::string img_dir = "";
     std::string sub_type = "";
     // speed test or inference
     // const int speed_test_iter = 1000;
     const int speed_test_iter = 1;
 
     if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type)) {
         std::cerr << "Arguments not right!" << std::endl;
-        std::cerr << "./yolov9 -s [.wts] [.engine] [c/e]  // serialize model to plan file" << std::endl;
+        std::cerr << "./yolov9 -s [.wts] [.engine] [c/e/gc/ge]  // serialize model to plan file" << std::endl;
         std::cerr << "./yolov9 -d [.engine] ../samples  // deserialize plan file and run inference" << std::endl;
         return -1;
     }

diff --git a/yolov9/include/block.h b/yolov9/include/block.h
@@ -42,3 +42,6 @@ nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map<std
 nvinfer1::ILayer* convBnNoAct(nvinfer1::INetworkDefinition* network,
                               std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int ch,
                               int k, int s, int p, std::string lname, int g);
+std::vector<IConcatenationLayer*> DDetect(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
+                                           std::vector<ILayer*> dets, int cls, std::vector<int> ch,
+                                           std::string lname);  // Detect head: returns one concatenation layer (DFL box output + reshaped class output) per entry of `dets`.
diff --git a/yolov9/include/config.h b/yolov9/include/config.h
@@ -7,7 +7,7 @@
 
 // For INT8, you need to prepare the calibration dataset; please refer to
 // https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5#int8-quantization
-#define USE_INT8  // set USE_INT8 or USE_FP16 or USE_FP32
+#define USE_FP16  // set USE_INT8 or USE_FP16 or USE_FP32
 
 #ifdef USE_INT8
 const static char* gCalibTablePath = "./calib";

diff --git a/yolov9/include/model.h b/yolov9/include/model.h
@@ -3,4 +3,6 @@
 #include <NvInfer.h>
 #include <string>
 nvinfer1::IHostMemory* build_engine_yolov9_e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name);
-nvinfer1::IHostMemory* build_engine_yolov9_c(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name);
+nvinfer1::IHostMemory* build_engine_yolov9_c(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name);
+nvinfer1::IHostMemory* build_engine_gelan_e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name);
+nvinfer1::IHostMemory* build_engine_gelan_c(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name);
diff --git a/yolov9/src/block.cpp b/yolov9/src/block.cpp
@@ -423,3 +423,36 @@ std::vector<IConcatenationLayer*> DualDDetect(INetworkDefinition* network, std::
     }
     return ret;
 }
+
+/**
+ * Builds the DDetect head: one box branch and one class branch per input feature layer.
+ *
+ * For every layer in `dets`, a box-regression branch is created with weights named
+ * `<lname>.cv2.<i>` and a classification branch with weights named `<lname>.cv3.<i>`.
+ * The class output is reshaped to two dimensions, the box output is passed through DFL
+ * (weights `<lname>.dfl`), and the two results are concatenated.
+ *
+ * @param network   Network definition that receives the new layers.
+ * @param weightMap Weights forwarded to the convolution and DFL helpers.
+ * @param dets      Feature layers feeding the head; output 0 of each layer is used.
+ * @param cls       Number of classes; sets the class branch width and the first reshape dimension.
+ * @param ch        Channel counts of the inputs; only ch[0] is read, so it must not be empty.
+ * @param lname     Prefix of the weight names of this head.
+ * @return One concatenation layer per entry of `dets`, in the same order.
+ */
+std::vector<IConcatenationLayer*> DDetect(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
+                                              std::vector<ILayer*> dets, int cls, std::vector<int> ch,
+                                              std::string lname) {
+    const int reg_max = 16;
+    // Hidden widths of the box branch (c2) and of the class branch (c3).
+    const int c2 = std::max(ch[0] / 4, reg_max * 4);
+    const int c3 = std::max(ch[0], std::min(cls * 2, 128));
+
+    std::vector<ILayer*> bboxlayers;
+    std::vector<ILayer*> clslayers;
+
+    for (size_t i = 0; i < dets.size(); i++) {
+        nvinfer1::ITensor& feature = *dets[i]->getOutput(0);
+        // Conv(x, c2, 3), Conv(c2, c2, 3, g=4), nn.Conv2d(c2, 4 * self.reg_max, 1, groups=4)
+        bboxlayers.push_back(
+                DetectBbox_Conv(network, weightMap, feature, c2, reg_max, lname + ".cv2." + std::to_string(i)));
+        // Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, self.nc, 1)
+        auto cls_layer = DetectCls_Conv(network, weightMap, feature, c3, cls, lname + ".cv3." + std::to_string(i));
+        // NOTE(review): d[1] * d[2] presumably flattens H x W of a CHW tensor - confirm the layout.
+        auto dim = cls_layer->getOutput(0)->getDimensions();
+        nvinfer1::IShuffleLayer* shuffle = network->addShuffle(*cls_layer->getOutput(0));
+        // Reshape with `cls`, the same value that sized the class branch above, rather than the
+        // global kNumClass, so the head stays consistent when the two differ.
+        shuffle->setReshapeDimensions(nvinfer1::Dims2{cls, dim.d[1] * dim.d[2]});
+        clslayers.push_back(shuffle);
+    }
+
+    std::vector<IConcatenationLayer*> ret;
+    for (size_t i = 0; i < dets.size(); i++) {
+        // softmax 16*4, w, h => 16, 4, w, h
+        auto loc = DFL(network, weightMap, *bboxlayers[i]->getOutput(0), reg_max, 1, 1, 0, lname + ".dfl");
+        nvinfer1::ITensor* inputTensor[] = {loc->getOutput(0), clslayers[i]->getOutput(0)};
+        ret.push_back(network->addConcatenation(inputTensor, 2));
+    }
+    return ret;
+}