Trt10 #1554

Merged 13 commits on Jul 12, 2024
8 changes: 8 additions & 0 deletions yolov5/yolov5_trt10/CMakeLists.txt
@@ -27,6 +27,7 @@ include_directories(${PROJECT_SOURCE_DIR}/plugin/)
file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)

add_definitions(-DAPI_EXPORTS)
add_library(myplugins SHARED ${PLUGIN_SRCS})
target_link_libraries(myplugins nvinfer cudart)

@@ -36,4 +37,11 @@ include_directories(${OpenCV_INCLUDE_DIRS})
add_executable(yolov5_cls yolov5_cls.cpp ${SRCS})
target_link_libraries(yolov5_cls nvinfer)
target_link_libraries(yolov5_cls cudart)
target_link_libraries(yolov5_cls myplugins)
target_link_libraries(yolov5_cls ${OpenCV_LIBS})

add_executable(yolov5_det yolov5_det.cpp ${SRCS})
target_link_libraries(yolov5_det nvinfer)
target_link_libraries(yolov5_det cudart)
target_link_libraries(yolov5_det myplugins)
target_link_libraries(yolov5_det ${OpenCV_LIBS})
31 changes: 29 additions & 2 deletions yolov5/yolov5_trt10/README.md
@@ -11,6 +11,7 @@ TensorRT: TensorRT-10.2.0.19
## Support

* [x] YOLOv5-cls supports FP32/FP16/INT8 and Python/C++ API
* [x] YOLOv5-det supports FP32/FP16/INT8 and Python/C++ API

## Config

@@ -25,8 +26,10 @@ TensorRT: TensorRT-10.2.0.19
git clone -b v7.0 https://github.com/ultralytics/yolov5.git
git clone -b trt10 https://github.com/wang-xinyu/tensorrtx.git
cd yolov5/
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n-cls.pt
wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5n.pt
cp [PATH-TO-TENSORRTX]/yolov5/gen_wts.py .
python gen_wts.py -w yolov5n-cls.pt -o yolov5n-cls.wts -t cls
python gen_wts.py -w yolov5n.pt -o yolov5n.wts
# Files 'yolov5n-cls.wts' and 'yolov5n.wts' will be generated.
```
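For orientation, the `.wts` file written by `gen_wts.py` is a plain-text weight dump. Below is a minimal reader sketch, assuming the usual tensorrtx layout (first line is the tensor count; each following line is `<name> <element-count> <hex32> ...`); the function name is illustrative, not the repo's actual loader:

```cpp
// Minimal sketch of a .wts reader, assuming the layout described above.
#include <cstdint>
#include <fstream>
#include <map>
#include <string>
#include <vector>

std::map<std::string, std::vector<uint32_t>> read_wts(const std::string& path) {
    std::ifstream input(path);
    int32_t count = 0;
    input >> count;  // first line: number of weight tensors
    std::map<std::string, std::vector<uint32_t>> weights;
    while (count-- > 0) {
        std::string name;
        uint32_t size = 0;
        input >> name >> std::dec >> size;  // tensor name and element count
        std::vector<uint32_t> vals(size);
        for (uint32_t i = 0; i < size; ++i) {
            input >> std::hex >> vals[i];  // each element is fp32 bits stored as hex
        }
        weights[name] = std::move(vals);
    }
    return weights;
}
```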
@@ -40,7 +43,7 @@ cd [PATH-TO-TENSORRTX]/yolov5/yolov5_trt10
# Update kNumClass in src/config.h if your model is trained on a custom dataset
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5s.wts .
cp [PATH-TO-ultralytics-yolov5]/yolov5n-cls.wts .
cmake ..
make

@@ -60,6 +63,30 @@ wget https://github.com/joannzhang00/ImageNet-dataset-classes-labels/blob/main/i
// Install python-tensorrt, pycuda, etc.
// Ensure yolov5n-cls.engine has been built first
python yolov5_cls_trt.py
# FAQ: if you hit pycuda._driver.LogicError on Windows or a segmentation fault
# on Linux, add the following imports to the Python file:
# import pycuda.autoinit
# import pycuda.driver as cuda
```

#### Detection

```shell
cd [PATH-TO-TENSORRTX]/yolov5/yolov5_trt10
# Update kNumClass in src/config.h if your model is trained on a custom dataset (see the config sketch after this block)
mkdir build
cd build
cp [PATH-TO-ultralytics-yolov5]/yolov5n.wts .
cmake ..
make

# Build and serialize TensorRT engine
./yolov5_det -s yolov5n.wts yolov5n.engine [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw]

# Run inference
./yolov5_det -d yolov5n.engine ../../images
# The results are displayed in the console
```
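The `kNumClass` constant mentioned in both build blocks lives in `src/config.h`; a hypothetical excerpt (the exact name and default are assumptions):

```cpp
// Hypothetical excerpt of src/config.h; adjust for a custom dataset.
constexpr static int kNumClass = 80;  // detection classes (COCO default)
```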

## INT8 Quantization
@@ -69,4 +96,4 @@ python yolov5_cls_trt.py
4. serialize the model and test
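As a rough sketch of what step 4 involves on the builder side, INT8 mode is requested on the `IBuilderConfig` before the engine is serialized. The function name below is an assumption, and `calibrator` stands in for the project's own `IInt8Calibrator` implementation fed with the calibration images:

```cpp
// Sketch: requesting INT8 kernels on a TensorRT builder config.
#include <NvInfer.h>

void enable_int8(nvinfer1::IBuilderConfig* config, nvinfer1::IInt8Calibrator* calibrator) {
    config->setFlag(nvinfer1::BuilderFlag::kINT8);  // quantize to INT8 where supported
    config->setInt8Calibrator(calibrator);          // supplies calibration batches
}
```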

## More Information
See the readme on the [home page](https://github.com/wang-xinyu/tensorrtx).
4 changes: 4 additions & 0 deletions yolov5/yolov5_trt10/plugin/yololayer.h
@@ -11,7 +11,9 @@ class API YoloLayerPlugin : public IPluginV2IOExt {
public:
YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, bool is_segmentation,
const std::vector<YoloKernel>& vYoloKernel);

YoloLayerPlugin(const void* data, size_t length);

~YoloLayerPlugin();

int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
@@ -66,6 +68,7 @@ class API YoloLayerPlugin : public IPluginV2IOExt {

private:
void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize = 1);

int mThreadCount = 256;
const char* mPluginNamespace;
int mKernelCount;
@@ -104,5 +107,6 @@ class API YoloPluginCreator : public IPluginCreator {
static PluginFieldCollection mFC;
static std::vector<PluginField> mPluginAttributes;
};

REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
}; // namespace nvinfer1
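For context, `REGISTER_TENSORRT_PLUGIN` adds `YoloPluginCreator` to the global plugin registry at static-initialization time, which is how `deserializeCudaEngine` later reconstructs the YOLO layer. A sketch of the lookup this enables; the name/version strings are assumptions, the real values come from `getPluginName()`/`getPluginVersion()`:

```cpp
// Sketch: retrieving a registered plugin creator from the global registry.
#include <NvInferRuntime.h>

nvinfer1::IPluginCreator* find_yolo_creator() {
    // Name and version are assumed; see YoloPluginCreator for the real strings.
    return getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
}
```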
281 changes: 281 additions & 0 deletions yolov5/yolov5_trt10/src/model.cpp

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions yolov5/yolov5_trt10/src/model.h
@@ -6,3 +6,11 @@
nvinfer1::IHostMemory* build_cls_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw,
std::string& wts_name);

nvinfer1::IHostMemory* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw,
std::string& wts_name);

nvinfer1::IHostMemory* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd,
float& gw, std::string& wts_name);
1 change: 1 addition & 0 deletions yolov5/yolov5_trt10/yolov5_cls_trt.py
@@ -9,6 +9,7 @@
import cv2
import numpy as np
import torch
import pycuda.autoinit # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt

259 changes: 259 additions & 0 deletions yolov5/yolov5_trt10/yolov5_det.cpp
@@ -0,0 +1,259 @@
#include "cuda_utils.h"
#include "logging.h"
#include "model.h"
#include "postprocess.h"
#include "preprocess.h"
#include "utils.h"

#include <chrono>
#include <cmath>
#include <fstream>  // std::ifstream / std::ofstream for engine (de)serialization
#include <iostream>

using namespace nvinfer1;

static Logger gLogger;
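// Output layout per image: element 0 holds the number of detections, followed by
// up to kMaxNumOutputBbox Detection structs flattened into floats (hence the +1).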
const static int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;

bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, bool& is_p6, float& gd, float& gw,
std::string& img_dir) {
if (argc < 4)
return false;
if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) {
wts = std::string(argv[2]);
engine = std::string(argv[3]);
auto net = std::string(argv[4]);
if (net[0] == 'n') {
gd = 0.33;
gw = 0.25;
} else if (net[0] == 's') {
gd = 0.33;
gw = 0.50;
} else if (net[0] == 'm') {
gd = 0.67;
gw = 0.75;
} else if (net[0] == 'l') {
gd = 1.0;
gw = 1.0;
} else if (net[0] == 'x') {
gd = 1.33;
gw = 1.25;
} else if (net[0] == 'c' && argc == 7) {
gd = atof(argv[5]);
gw = atof(argv[6]);
} else {
return false;
}
if (net.size() == 2 && net[1] == '6') {
is_p6 = true;
}
} else if (std::string(argv[1]) == "-d" && argc == 4) {
engine = std::string(argv[2]);
img_dir = std::string(argv[3]);
} else {
return false;
}
return true;
}

void prepare_buffers(ICudaEngine* engine, float** gpu_input_buffer, float** gpu_output_buffer,
float** cpu_output_buffer) {
assert(engine->getNbIOTensors() == 2);
// In order to bind the buffers, we need to know the names of the input and output tensors.
// TensorRT 10 addresses I/O tensors by name; the binding-index API of earlier versions is gone.
TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName);
if (input_mode != TensorIOMode::kINPUT) {
std::cerr << kInputTensorName << " should be input tensor" << std::endl;
assert(false);
}
TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName);
if (output_mode != TensorIOMode::kOUTPUT) {
std::cerr << kOutputTensorName << " should be output tensor" << std::endl;
assert(false);
}
// Create GPU buffers on device
CUDA_CHECK(cudaMalloc((void**)gpu_input_buffer, kBatchSize * 3 * kInputH * kInputW * sizeof(float)));
CUDA_CHECK(cudaMalloc((void**)gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float)));

*cpu_output_buffer = new float[kBatchSize * kOutputSize];
}

void infer(IExecutionContext& context, cudaStream_t& stream, void** gpu_buffers, float* output, int batchsize) {
context.setInputTensorAddress(kInputTensorName, gpu_buffers[0]);
context.setOutputTensorAddress(kOutputTensorName, gpu_buffers[1]);
context.enqueueV3(stream);  // asynchronous execution, the TensorRT 10 replacement for enqueueV2
CUDA_CHECK(cudaMemcpyAsync(output, gpu_buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,
stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
}

void serialize_engine(unsigned int max_batchsize, bool& is_p6, float& gd, float& gw, std::string& wts_name,
std::string& engine_name) {
// Create builder
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();

// Create model to populate the network, then set the outputs and create an engine
IHostMemory* serialized_engine = nullptr;
if (is_p6) {
serialized_engine = build_det_p6_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
} else {
serialized_engine = build_det_engine(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name);
}
assert(serialized_engine != nullptr);

// Save engine to file
std::ofstream p(engine_name, std::ios::binary);
if (!p) {
std::cerr << "Could not open plan output file" << std::endl;
assert(false);
}
p.write(reinterpret_cast<const char*>(serialized_engine->data()), serialized_engine->size());

// Close everything down
delete serialized_engine;
delete config;
delete builder;
}

void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine,
IExecutionContext** context) {
std::ifstream file(engine_name, std::ios::binary);
if (!file.good()) {
std::cerr << "read " << engine_name << " error!" << std::endl;
assert(false);
}
size_t size = 0;
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
char* serialized_engine = new char[size];
assert(serialized_engine);
file.read(serialized_engine, size);
file.close();

*runtime = createInferRuntime(gLogger);
assert(*runtime);
*engine = (*runtime)->deserializeCudaEngine(serialized_engine, size);
assert(*engine);
*context = (*engine)->createExecutionContext();
assert(*context);
delete[] serialized_engine;
}

int main(int argc, char** argv) {
// -s ../models/yolov5n.wts ../models/yolov5n.fp32.trt n
// -d ../models/yolov5n.fp32.trt ../images
cudaSetDevice(kGpuId);

std::string wts_name = "";
std::string engine_name = "";
bool is_p6 = false;
float gd = 0.0f, gw = 0.0f;
std::string img_dir;

if (!parse_args(argc, argv, wts_name, engine_name, is_p6, gd, gw, img_dir)) {
std::cerr << "arguments not right!" << std::endl;
std::cerr << "./yolov5_det -s [.wts] [.engine] [n/s/m/l/x/n6/s6/m6/l6/x6 or c/c6 gd gw] // serialize model to "
"plan file"
<< std::endl;
std::cerr << "./yolov5_det -d [.engine] ../images // deserialize plan file and run inference" << std::endl;
return -1;
}

// Create a model using the API directly and serialize it to a file
if (!wts_name.empty()) {
serialize_engine(kBatchSize, is_p6, gd, gw, wts_name, engine_name);
return 0;
}

// Deserialize the engine from file
IRuntime* runtime = nullptr;
ICudaEngine* engine = nullptr;
IExecutionContext* context = nullptr;
deserialize_engine(engine_name, &runtime, &engine, &context);
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));

// Init CUDA preprocessing
cuda_preprocess_init(kMaxInputImageSize);

// Prepare cpu and gpu buffers
float* gpu_buffers[2];
float* cpu_output_buffer = nullptr;
prepare_buffers(engine, &gpu_buffers[0], &gpu_buffers[1], &cpu_output_buffer);

// Read images from directory
std::vector<std::string> file_names;
if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
std::cerr << "read_files_in_dir failed." << std::endl;
return -1;
}

// batch predict
for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
// Get a batch of images
std::vector<cv::Mat> img_batch;
std::vector<std::string> img_name_batch;
for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
img_batch.push_back(img);
img_name_batch.push_back(file_names[j]);
}

// Preprocess
cuda_batch_preprocess(img_batch, gpu_buffers[0], kInputW, kInputH, stream);

// Run inference
auto start = std::chrono::system_clock::now();
infer(*context, stream, (void**)gpu_buffers, cpu_output_buffer, kBatchSize);
auto end = std::chrono::system_clock::now();
std::cout << "inference time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< "ms" << std::endl;

// NMS
std::vector<std::vector<Detection>> res_batch;
batch_nms(res_batch, cpu_output_buffer, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);

// print results
for (size_t j = 0; j < res_batch.size(); j++) {
for (size_t k = 0; k < res_batch[j].size(); k++) {
std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", "
<< res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", "
<< res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf
<< ", class_id: " << res_batch[j][k].class_id << std::endl;
}
}

// Draw bounding boxes
draw_bbox(img_batch, res_batch);

// Save images
for (size_t j = 0; j < img_batch.size(); j++) {
cv::imwrite("_" + img_name_batch[j], img_batch[j]);
}
}

// Release stream and buffers
cudaStreamDestroy(stream);
CUDA_CHECK(cudaFree(gpu_buffers[0]));
CUDA_CHECK(cudaFree(gpu_buffers[1]));
delete[] cpu_output_buffer;
cuda_preprocess_destroy();
// Destroy the engine
delete context;
delete engine;
delete runtime;

return 0;
}