目录
void* buffers和 cudaStream_t 释放
yolo 代码示例
- bool YOLO::load_model(std::string trt_path) {
-
- size_t size{ 0 };
- char *trtModelStream{ nullptr };
- std::ifstream file(trt_path, std::ios::binary);
- if (file.good()) {
- file.seekg(0, file.end);
- size = file.tellg();
- file.seekg(0, file.beg);
- trtModelStream = new char[size];
- assert(trtModelStream);
- file.read(trtModelStream, size);
- file.close();
- }
- std::cout << "engine init finished" << std::endl;
-
- runtime = createInferRuntime(gLogger);
- assert(runtime != nullptr);
- engine = runtime->deserializeCudaEngine(trtModelStream, size);
- assert(engine != nullptr);
- context = engine->createExecutionContext();
- assert(context != nullptr);
- delete[] trtModelStream;
-
- return true;
- }
- float* input_data_host = nullptr;
- //cudaMallocHost(&input_data_host, batch_size * sizeof(float));
- cudaMallocHost(&input_data_host, batch_size * 3 * this->INPUT_H * this->INPUT_W * sizeof(float));
- auto t_1 = std::chrono::high_resolution_clock::now();
- for (int i = 0; i < image_list.size(); i++) {
- cv::Mat img_o = image_list.at(i);
-
- cv::Mat img_raw = this->static_resize(img_o);
-
- int input_height = img_raw.rows;
- int input_width = img_raw.cols;
-
- int image_area = img_raw.cols * img_raw.rows;
- unsigned char* pimage = img_raw.data;
- float* phost_b = input_data_host + image_area * 0 + i * input_channel * input_height * input_width;
- float* phost_g = input_data_host + image_area * 1 + i * input_channel * input_height * input_width;
- float* phost_r = input_data_host + image_area * 2 + i * input_channel * input_height * input_width;
- for (int mm = 0; mm < image_area; ++mm, pimage += 3) {
- *phost_r++ = pimage[0] / 255.0f;// (pimage[0] / 255.0f - mean[0]) / std[0];
- *phost_g++ = pimage[1] / 255.0f;;// (pimage[1] / 255.0f - mean[1]) / std[1];
- *phost_b++ = pimage[2] / 255.0f;;//(pimage[2] / 255.0f - mean[2]) / std[2];
- }
- }
-
-
- ...
- checkRuntime(cudaFreeHost(input_data_host));
- void* buffers[2];
-
- // In order to bind the buffers, we need to know the names of the input and output tensors.
- // Note that indices are guaranteed to be less than IEngine::getNbBindings()
- const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
-
- assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
- const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
- assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
- int mBatchSize = engine.getMaxBatchSize();
-
- // Create GPU buffers on device
- CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
- CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));
-
- // Create stream
- cudaStream_t stream;
- CHECK(cudaStreamCreate(&stream));
-
- // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
- CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
- //context.enqueue(1, buffers, stream, nullptr);
-
- context->enqueueV2(buffers, stream, nullptr);
- CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
- cudaStreamSynchronize(stream);
-
- // Release stream and buffers
- cudaStreamDestroy(stream);
- CHECK(cudaFree(buffers[inputIndex]));
- CHECK(cudaFree(buffers[outputIndex]));
下面这段释放代码在运行时报错:
- context->destroy(); // Release the execution context first; load_model creates it from the engine.
- engine->destroy(); // NOTE(review): the note reports that this call fails, cause unknown - possibly the engine is already released elsewhere (double destroy); confirm.
- runtime->destroy(); // Release the runtime last; load_model deserializes the engine through it.
后来发现,把 engine->destroy(); 这一行注释掉之后,就不再报错了。
原因未知。