• tensorrt内存释放 笔记


    目录

    load_model char * 释放 

    cudaMallocHost float*释放

    void* buffers和 cudaStream_t 释放

    推理释放模型内存:


    load_model char * 释放 

    yolo 代码示例

    1. bool YOLO::load_model(std::string trt_path) {
    2. size_t size{ 0 };
    3. char *trtModelStream{ nullptr };
    4. std::ifstream file(trt_path, std::ios::binary);
    5. if (file.good()) {
    6. file.seekg(0, file.end);
    7. size = file.tellg();
    8. file.seekg(0, file.beg);
    9. trtModelStream = new char[size];
    10. assert(trtModelStream);
    11. file.read(trtModelStream, size);
    12. file.close();
    13. }
    14. std::cout << "engine init finished" << std::endl;
    15. runtime = createInferRuntime(gLogger);
    16. assert(runtime != nullptr);
    17. engine = runtime->deserializeCudaEngine(trtModelStream, size);
    18. assert(engine != nullptr);
    19. context = engine->createExecutionContext();
    20. assert(context != nullptr);
    21. delete[] trtModelStream;
    22. return true;
    23. }

    cudaMallocHost float*释放

    1. float* input_data_host = nullptr;
    2. //cudaMallocHost(&input_data_host, batch_size * sizeof(float));
    3. cudaMallocHost(&input_data_host, batch_size * 3 * this->INPUT_H * this->INPUT_W * sizeof(float));
    4. auto t_1 = std::chrono::high_resolution_clock::now();
    5. for (int i = 0; i < image_list.size(); i++) {
    6. cv::Mat img_o = image_list.at(i);
    7. cv::Mat img_raw = this->static_resize(img_o);
    8. int input_height = img_raw.rows;
    9. int input_width = img_raw.cols;
    10. int image_area = img_raw.cols * img_raw.rows;
    11. unsigned char* pimage = img_raw.data;
    12. float* phost_b = input_data_host + image_area * 0 + i * input_channel * input_height * input_width;
    13. float* phost_g = input_data_host + image_area * 1 + i * input_channel * input_height * input_width;
    14. float* phost_r = input_data_host + image_area * 2 + i * input_channel * input_height * input_width;
    15. for (int mm = 0; mm < image_area; ++mm, pimage += 3) {
    16. *phost_r++ = pimage[0] / 255.0f;// (pimage[0] / 255.0f - mean[0]) / std[0];
    17. *phost_g++ = pimage[1] / 255.0f;;// (pimage[1] / 255.0f - mean[1]) / std[1];
    18. *phost_b++ = pimage[2] / 255.0f;;//(pimage[2] / 255.0f - mean[2]) / std[2];
    19. }
    20. }
    21. ...
    22. checkRuntime(cudaFreeHost(input_data_host));

    void* buffers和 cudaStream_t 释放

    1. void* buffers[2];
    2. // In order to bind the buffers, we need to know the names of the input and output tensors.
    3. // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    4. const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    5. assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    6. const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    7. assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
    8. int mBatchSize = engine.getMaxBatchSize();
    9. // Create GPU buffers on device
    10. CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
    11. CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));
    12. // Create stream
    13. cudaStream_t stream;
    14. CHECK(cudaStreamCreate(&stream));
    15. // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    16. CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
    17. //context.enqueue(1, buffers, stream, nullptr);
    18. context->enqueueV2(buffers, stream, nullptr);
    19. CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    20. cudaStreamSynchronize(stream);
    21. // Release stream and buffers
    22. cudaStreamDestroy(stream);
    23. CHECK(cudaFree(buffers[inputIndex]));
    24. CHECK(cudaFree(buffers[outputIndex]));

    推理释放模型内存:

    下面这段释放代码运行时会报错:

    1. context->destroy();
    2. engine->destroy();
    3. runtime->destroy();

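    后来发现把 engine->destroy(); 注释掉,就不报错了。

    原因未知。需要注意:注释掉 engine->destroy() 只是绕开了报错,engine 占用的内存并不会被释放;可能的原因是 engine 在别处已经被销毁(重复销毁),或者销毁时仍有对象在使用它,有待进一步确认。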

  • 相关阅读:
    【VUE项目实战】55、商品添加功能(五)-商品内容模块
    java第二十六课 —— java动态绑定机制 | 多态的应用(一)
    (附源码)计算机毕业设计Java坝上长尾鸡养殖管理系统
    C++中变量是按值访问的, Python 中变量的值是按引用访问的示例说明
    stream().sorted()以及java中常用的比较器
    tf.compat.v1.estimator.tpu.TPUEstimator参数说明
    WebGL编程指南-23 光照原理、漫反射光计算、漫反射光照射下的立方体
    linux安装vsftp
    【竞赛题目】木块(C语言详解)
    SQL自连接,内连接,区别
  • 原文地址:https://blog.csdn.net/jacke121/article/details/127836343