• TensorRT


1: .pth → .onnx → .engine

    The usual conversion pipeline: export the PyTorch .pth checkpoint to ONNX (typically with torch.onnx.export), then build a TensorRT .engine from the ONNX file using the TensorRT ONNX parser or the trtexec tool (see the build sketch after the links below).

2: float32 → float16 → int8 (the inference precisions TensorRT supports; FP16 and INT8 trade some accuracy for speed, and INT8 additionally requires calibration)

3: For the explicit-batch vs. implicit-batch distinction, see:

【TensorRT】execute_async VS execute_async_v2 (昌山小屋, CSDN blog)

    Developer Guide :: NVIDIA Deep Learning TensorRT Documentation
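
    For points 1 and 2 above: the .pth → .onnx export is done in Python with torch.onnx.export, and the .onnx → .engine build can be done either with the trtexec command-line tool or programmatically through the TensorRT C++ ONNX parser. Below is a minimal sketch, assuming TensorRT 8.x and placeholder paths model.onnx / model.engine; the Logger here is a samples-style stand-in, not part of the TensorRT core API. It builds an explicit-batch engine and enables FP16 when the hardware supports it:

    // CLI equivalent: trtexec --onnx=model.onnx --saveEngine=model.engine --fp16
    #include "NvInfer.h"
    #include "NvOnnxParser.h"
    #include <fstream>
    #include <iostream>

    class Logger : public nvinfer1::ILogger {
        void log(Severity severity, const char* msg) noexcept override {
            if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
        }
    };

    int main() {
        Logger logger;
        auto* builder = nvinfer1::createInferBuilder(logger);

        // Explicit batch: the batch size is part of the tensor dimensions
        // rather than a separate argument passed at enqueue time.
        const auto explicitBatch = 1U << static_cast<uint32_t>(
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
        auto* network = builder->createNetworkV2(explicitBatch);

        // Parse the ONNX file into the network definition.
        auto* parser = nvonnxparser::createParser(*network, logger);
        if (!parser->parseFromFile("model.onnx",
                static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)))
            return 1;

        // Precision: FP32 is the default; kFP16 enables float16 kernels.
        // INT8 would additionally need a calibrator (BuilderFlag::kINT8).
        auto* config = builder->createBuilderConfig();
        if (builder->platformHasFastFp16())
            config->setFlag(nvinfer1::BuilderFlag::kFP16);

        // Build, serialize, and write the engine to disk.
        nvinfer1::IHostMemory* serialized = builder->buildSerializedNetwork(*network, *config);
        std::ofstream out("model.engine", std::ios::binary);
        out.write(static_cast<const char*>(serialized->data()), serialized->size());
        return 0;
    }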

The basic inference steps are as follows:

1: Initialization: parse (deserialize) the model, query the input/output sizes, and allocate the corresponding memory.

FODLINE::FODLINE(const char* modelPath)
    {
        int DEVICE = wether_GPU();          // select a CUDA device (helper defined elsewhere)
        size_t size{0};
        static Logger gLogger;
        char* trtModelStream{nullptr};
        const std::string engine_file_path{modelPath};
        cout << "path " << engine_file_path << endl;

        // Read the serialized engine file into a host buffer.
        std::ifstream file(engine_file_path, std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }

        // Deserialize the engine and create the execution context.
        // (context is a class member; runtime and engine should also be
        // kept as members so they can be released in the destructor.)
        IRuntime* runtime = createInferRuntime(gLogger);
        assert(runtime != nullptr);
        ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
        assert(engine != nullptr);
        context = engine->createExecutionContext();
        assert(context != nullptr);
        delete[] trtModelStream;

        // Binding 0 is the input; for every output binding, compute its
        // element count and allocate a matching host buffer.
        int num = engine->getNbBindings();
        for (int i = 1; i < num; i++) {
            auto out_dims = engine->getBindingDimensions(i);
            auto output_size1 = 1;
            for (int j = 0; j < out_dims.nbDims; j++) {
                output_size1 *= out_dims.d[j];
            }
            output_eng.push_back(new float[output_size1]);
            output_size.push_back(output_size1);
        }
    }
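
    Note: the constructor above assumes a Logger class (sketched earlier), and doInference below assumes a CHECK macro; both come from the TensorRT sample code rather than the core API. If your project does not already define CHECK, a minimal stand-in (a common pattern, not necessarily the original repo's exact macro) is:

    #include <cuda_runtime_api.h>
    #include <cstdlib>
    #include <iostream>

    // Abort with a readable message when a CUDA runtime call fails.
    #define CHECK(status)                                                    \
        do {                                                                 \
            cudaError_t err = (status);                                      \
            if (err != cudaSuccess) {                                        \
                std::cerr << "CUDA error: " << cudaGetErrorString(err)       \
                          << " at " << __FILE__ << ":" << __LINE__ << "\n";  \
                std::abort();                                                \
            }                                                                \
        } while (0)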

2: Pass the image in and copy the data to the GPU for inference. The 5 below is the total number of bindings (inputs plus outputs): in general binding 0 is the input and the rest are output branches, but this depends on how the ONNX model was defined and how it was converted to TensorRT. Instead of listing the four outputs one by one as below, you can also loop over the bindings (see the sketch after the code). Once the outputs have been copied back, everything that follows is post-processing; adapt it to your own model. If this is still unclear, see the full project: ULTRL-DETECT: C++ lane detection (ULTRL-DETECT: c++实现车道线检测 - Gitee.com).

void FODLINE::doInference(IExecutionContext& context, float* input, vector<float*> output, vector<int> output_size, cv::Size input_shape) {
        const ICudaEngine& engine = context.getEngine();

        // Pointers to input and output device buffers to pass to the engine.
        // The engine requires exactly IEngine::getNbBindings() buffers:
        // here 1 input + 4 outputs.
        assert(engine.getNbBindings() == 5);
        void* buffers[5];

        // To bind the buffers, we need the names of the input and output
        // tensors. Indices are guaranteed to be less than getNbBindings().
        const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
        assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
        const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
        const int outputIndex1 = engine.getBindingIndex(OUTPUT_BLOB_NAME1);
        const int outputIndex2 = engine.getBindingIndex(OUTPUT_BLOB_NAME2);
        const int outputIndex3 = engine.getBindingIndex(OUTPUT_BLOB_NAME3);
        assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
        int mBatchSize = engine.getMaxBatchSize();  // implicit-batch API; unused here since the batch is fixed to 1

        // Create GPU buffers on the device.
        CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
        CHECK(cudaMalloc(&buffers[outputIndex], output_size[0] * sizeof(float)));
        CHECK(cudaMalloc(&buffers[outputIndex1], output_size[1] * sizeof(float)));
        CHECK(cudaMalloc(&buffers[outputIndex2], output_size[2] * sizeof(float)));
        CHECK(cudaMalloc(&buffers[outputIndex3], output_size[3] * sizeof(float)));

        // Create a stream.
        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        // DMA the input to the device, run inference asynchronously,
        // and DMA the outputs back to the host.
        CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
        context.enqueue(1, buffers, stream, nullptr);
        CHECK(cudaMemcpyAsync(output[0], buffers[outputIndex], output_size[0] * sizeof(float), cudaMemcpyDeviceToHost, stream));
        CHECK(cudaMemcpyAsync(output[1], buffers[outputIndex1], output_size[1] * sizeof(float), cudaMemcpyDeviceToHost, stream));
        CHECK(cudaMemcpyAsync(output[2], buffers[outputIndex2], output_size[2] * sizeof(float), cudaMemcpyDeviceToHost, stream));
        CHECK(cudaMemcpyAsync(output[3], buffers[outputIndex3], output_size[3] * sizeof(float), cudaMemcpyDeviceToHost, stream));
        cudaStreamSynchronize(stream);

        // Release the stream and buffers.
        cudaStreamDestroy(stream);
        CHECK(cudaFree(buffers[inputIndex]));
        CHECK(cudaFree(buffers[outputIndex]));
        CHECK(cudaFree(buffers[outputIndex1]));
        CHECK(cudaFree(buffers[outputIndex2]));
        CHECK(cudaFree(buffers[outputIndex3]));
    }
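
    As noted in step 2, the four explicitly named outputs can be replaced with a loop over the bindings, which also frees the function from the hard-coded count of 5. A sketch under the same assumptions as the code above (binding 0 is the input); it also switches from enqueue(), the deprecated implicit-batch call, to enqueueV2(), which is what an engine built with the explicit-batch flag requires and which takes no batch argument:

    // Loop-based variant of doInference's buffer handling.
    const int nb = engine.getNbBindings();
    std::vector<void*> buffers(nb);
    CHECK(cudaMalloc(&buffers[0], 3 * input_shape.height * input_shape.width * sizeof(float)));
    for (int i = 1; i < nb; ++i)
        CHECK(cudaMalloc(&buffers[i], output_size[i - 1] * sizeof(float)));

    CHECK(cudaMemcpyAsync(buffers[0], input,
                          3 * input_shape.height * input_shape.width * sizeof(float),
                          cudaMemcpyHostToDevice, stream));
    context.enqueueV2(buffers.data(), stream, nullptr);   // explicit-batch inference
    for (int i = 1; i < nb; ++i)
        CHECK(cudaMemcpyAsync(output[i - 1], buffers[i],
                              output_size[i - 1] * sizeof(float),
                              cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    for (int i = 0; i < nb; ++i)
        CHECK(cudaFree(buffers[i]));

    A call site then constructs FODLINE once with the engine path, preprocesses each frame into a CHW float buffer, and calls doInference with the output_eng and output_size vectors prepared in the constructor; everything after that is model-specific post-processing.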

  • Original article: https://blog.csdn.net/qq_51609636/article/details/121850072