• Installation and Usage of TensorRT


    Installation

    Install CUDA and cuDNN in advance, then log in to the NVIDIA website and download the TensorRT archive that matches the host's CUDA version.

    Taking CUDA 10.2 as an example, choose the TensorRT tar package built for CUDA 10.2, then install and test it with commands similar to the following:

    # Install the C++ version
    cd /the/path/of/tensorrt/tar/gz/file
    tar -zxvf TensorRT-8.2.5.1.linux.x86_64-gnu.cuda-10.2.cudnn8.2.tar.gz
    export TENSORRT_DIR=$(pwd)/TensorRT-8.2.5.1
    export LD_LIBRARY_PATH=$TENSORRT_DIR/lib:$LD_LIBRARY_PATH
    # Install the Python version
    pip install TensorRT-8.2.5.1/python/tensorrt-8.2.5.1-cp37-none-linux_x86_64.whl
    python -c "import tensorrt;print(tensorrt.__version__)"  # prints 8.2.5.1, which means the installation succeeded
    # Alternatively, pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt==9.2.0.post12.dev also works

    Building a TRT Model

    Building a Network by Hand

    Using the Python API

    import tensorrt as trt

    verbose = True
    IN_NAME = 'input'
    OUT_NAME = 'output'
    IN_H = 224
    IN_W = 224
    BATCH_SIZE = 1
    EXPLICIT_BATCH = 1 << (int)(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()
    with trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config(
    ) as config, builder.create_network(EXPLICIT_BATCH) as network:
        # define network
        input_tensor = network.add_input(
            name=IN_NAME, dtype=trt.float32, shape=(BATCH_SIZE, 3, IN_H, IN_W))
        pool = network.add_pooling(
            input=input_tensor, type=trt.PoolingType.MAX, window_size=(2, 2))
        pool.stride = (2, 2)
        pool.get_output(0).name = OUT_NAME
        network.mark_output(pool.get_output(0))

        # serialize the model to engine file
        profile = builder.create_optimization_profile()
        profile.set_shape_input('input', *[[BATCH_SIZE, 3, IN_H, IN_W]] * 3)
        builder.max_batch_size = 1
        config.max_workspace_size = 1 << 30
        engine = builder.build_engine(network, config)
        with open('model_python_trt.engine', mode='wb') as f:
            f.write(bytearray(engine.serialize()))
            print("generating file done!")

    Using the C++ API

    #include <fstream>
    #include <iostream>
    #include <cassert>
    #include <NvInfer.h>
    #include <../samples/common/logger.h>

    using namespace nvinfer1;
    using namespace sample;

    const char* IN_NAME = "input";
    const char* OUT_NAME = "output";
    static const int IN_H = 224;
    static const int IN_W = 224;
    static const int BATCH_SIZE = 1;
    static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

    int main(int argc, char** argv)
    {
        // Create builder
        Logger m_logger;
        IBuilder* builder = createInferBuilder(m_logger);
        IBuilderConfig* config = builder->createBuilderConfig();

        // Create model to populate the network
        INetworkDefinition* network = builder->createNetworkV2(EXPLICIT_BATCH);
        ITensor* input_tensor = network->addInput(IN_NAME, DataType::kFLOAT, Dims4{ BATCH_SIZE, 3, IN_H, IN_W });
        IPoolingLayer* pool = network->addPoolingNd(*input_tensor, PoolingType::kMAX, DimsHW{ 2, 2 });
        pool->setStrideNd(DimsHW{ 2, 2 });
        pool->getOutput(0)->setName(OUT_NAME);
        network->markOutput(*pool->getOutput(0));

        // Build engine
        IOptimizationProfile* profile = builder->createOptimizationProfile();
        profile->setDimensions(IN_NAME, OptProfileSelector::kMIN, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
        profile->setDimensions(IN_NAME, OptProfileSelector::kOPT, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
        profile->setDimensions(IN_NAME, OptProfileSelector::kMAX, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
        config->setMaxWorkspaceSize(1 << 20);
        ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

        // Serialize the model to engine file
        IHostMemory* modelStream{ nullptr };
        assert(engine != nullptr);
        modelStream = engine->serialize();
        std::ofstream p("model.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open output file to save model" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        std::cout << "generating file done!" << std::endl;

        // Release resources
        modelStream->destroy();
        network->destroy();
        engine->destroy();
        builder->destroy();
        config->destroy();
        return 0;
    }

    Converting an ONNX Model

    trtexec

    Failed attempt: I downloaded the official TensorRT Docker images and tried to use the trtexec bundled inside them, but on an A800 machine with driver version 515.105, every image version reported a mismatch between the CUDA version and the driver version.

    docker pull nvcr.io/nvidia/tensorrt:xx.xx-py3
    # the first xx is the year and the second xx is the month, e.g. 22.09 or 23.01
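
    For reference, the trtexec in these images is normally reached by starting the container with GPU access, roughly as in the sketch below (standard Docker / NVIDIA Container Toolkit usage; the tag is whichever image was pulled above). In my environment this route still hit the version-mismatch error described above.

    # rough sketch: launch the pulled image with GPU access and mount the current directory
    docker run --gpus all -it --rm -v $(pwd):/workspace nvcr.io/nvidia/tensorrt:xx.xx-py3
    # inside the container, trtexec typically lives under /usr/src/tensorrt/bin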

    Successful attempt: I downloaded the TensorRT tar package directly and ran the trtexec binary from its bin directory; this appears not to depend on a separately installed CUDA/cuDNN. Note, however, that an engine file built with one TensorRT version may fail to run under a different TensorRT version. A minimal invocation is sketched below.
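
    As a minimal sketch (model.onnx and model.engine follow the naming used elsewhere in this post; --onnx, --saveEngine, --fp16 and the *Shapes options are standard trtexec flags), an ONNX-to-engine conversion looks roughly like this:

    cd $TENSORRT_DIR/bin
    # build a serialized engine from the ONNX model; append --fp16 to also enable FP16 kernels
    ./trtexec --onnx=model.onnx --saveEngine=model.engine
    # for dynamic input shapes, the optimization profile can be given explicitly, e.g.
    # ./trtexec --onnx=model.onnx --saveEngine=model.engine --minShapes=input:1x3x224x224 --optShapes=input:1x3x224x224 --maxShapes=input:1x3x224x224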

    Using the Python API

    import torch
    import onnx
    import tensorrt as trt

    onnx_model = 'model.onnx'

    class NaiveModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.pool = torch.nn.MaxPool2d(2, 2)

        def forward(self, x):
            return self.pool(x)

    device = torch.device('cuda:0')

    # generate ONNX model
    torch.onnx.export(NaiveModel(), torch.randn(1, 3, 224, 224), onnx_model, input_names=['input'], output_names=['output'], opset_version=11)
    onnx_model = onnx.load(onnx_model)

    # create builder and network
    logger = trt.Logger(trt.Logger.ERROR)
    builder = trt.Builder(logger)
    EXPLICIT_BATCH = 1 << (int)(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)

    # parse onnx
    parser = trt.OnnxParser(network, logger)
    if not parser.parse(onnx_model.SerializeToString()):
        error_msgs = ''
        for error in range(parser.num_errors):
            error_msgs += f'{parser.get_error(error)}\n'
        raise RuntimeError(f'Failed to parse onnx, {error_msgs}')

    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 20
    profile = builder.create_optimization_profile()
    profile.set_shape('input', [1, 3, 224, 224], [1, 3, 224, 224], [1, 3, 224, 224])
    config.add_optimization_profile(profile)

    # create engine
    with torch.cuda.device(device):
        engine = builder.build_engine(network, config)

    with open('model.engine', mode='wb') as f:
        f.write(bytearray(engine.serialize()))
        print("generating file done!")

    Using the C++ API

    #include <fstream>
    #include <iostream>
    #include <cassert>
    #include <NvInfer.h>
    #include <NvOnnxParser.h>
    #include <../samples/common/logger.h>

    using namespace nvinfer1;
    using namespace nvonnxparser;
    using namespace sample;

    int main(int argc, char** argv)
    {
        // Create builder
        Logger m_logger;
        IBuilder* builder = createInferBuilder(m_logger);
        const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
        IBuilderConfig* config = builder->createBuilderConfig();

        // Create model to populate the network
        INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

        // Parse ONNX file
        IParser* parser = nvonnxparser::createParser(*network, m_logger);
        bool parser_status = parser->parseFromFile("model.onnx", static_cast<int>(ILogger::Severity::kWARNING));

        // Get the name of network input
        Dims dim = network->getInput(0)->getDimensions();
        if (dim.d[0] == -1)  // -1 means it is a dynamic model
        {
            const char* name = network->getInput(0)->getName();
            IOptimizationProfile* profile = builder->createOptimizationProfile();
            profile->setDimensions(name, OptProfileSelector::kMIN, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
            profile->setDimensions(name, OptProfileSelector::kOPT, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
            profile->setDimensions(name, OptProfileSelector::kMAX, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
            config->addOptimizationProfile(profile);
        }

        // Build engine
        config->setMaxWorkspaceSize(1 << 20);
        ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

        // Serialize the model to engine file
        IHostMemory* modelStream{ nullptr };
        assert(engine != nullptr);
        modelStream = engine->serialize();
        std::ofstream p("model.engine", std::ios::binary);
        if (!p) {
            std::cerr << "could not open output file to save model" << std::endl;
            return -1;
        }
        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
        std::cout << "generate file success!" << std::endl;

        // Release resources
        modelStream->destroy();
        network->destroy();
        engine->destroy();
        builder->destroy();
        config->destroy();
        return 0;
    }

    Model Inference

    Using the Python API

    # Takes a 1x3x224x224 tensor as input and produces a 1x3x112x112 tensor as output
    from typing import Union, Optional, Sequence, Dict, Any

    import torch
    import tensorrt as trt

    class TRTWrapper(torch.nn.Module):
        def __init__(self, engine: Union[str, trt.ICudaEngine],
                     output_names: Optional[Sequence[str]] = None) -> None:
            super().__init__()
            self.engine = engine
            if isinstance(self.engine, str):
                with trt.Logger() as logger, trt.Runtime(logger) as runtime:
                    with open(self.engine, mode='rb') as f:
                        engine_bytes = f.read()
                    self.engine = runtime.deserialize_cuda_engine(engine_bytes)
            self.context = self.engine.create_execution_context()
            names = [_ for _ in self.engine]
            input_names = list(filter(self.engine.binding_is_input, names))
            self._input_names = input_names
            self._output_names = output_names

            if self._output_names is None:
                output_names = list(set(names) - set(input_names))
                self._output_names = output_names

        def forward(self, inputs: Dict[str, torch.Tensor]):
            assert self._input_names is not None
            assert self._output_names is not None
            bindings = [None] * (len(self._input_names) + len(self._output_names))
            profile_id = 0
            for input_name, input_tensor in inputs.items():
                # check if input shape is valid
                profile = self.engine.get_profile_shape(profile_id, input_name)
                assert input_tensor.dim() == len(
                    profile[0]), 'Input dim is different from engine profile.'
                for s_min, s_input, s_max in zip(profile[0], input_tensor.shape,
                                                 profile[2]):
                    assert s_min <= s_input <= s_max, \
                        'Input shape should be between ' \
                        + f'{profile[0]} and {profile[2]}' \
                        + f' but get {tuple(input_tensor.shape)}.'
                idx = self.engine.get_binding_index(input_name)

                # All input tensors must be gpu variables
                assert 'cuda' in input_tensor.device.type
                input_tensor = input_tensor.contiguous()
                if input_tensor.dtype == torch.long:
                    input_tensor = input_tensor.int()
                self.context.set_binding_shape(idx, tuple(input_tensor.shape))
                bindings[idx] = input_tensor.contiguous().data_ptr()

            # create output tensors
            outputs = {}
            for output_name in self._output_names:
                idx = self.engine.get_binding_index(output_name)
                dtype = torch.float32
                shape = tuple(self.context.get_binding_shape(idx))

                device = torch.device('cuda')
                output = torch.empty(size=shape, dtype=dtype, device=device)
                outputs[output_name] = output
                bindings[idx] = output.data_ptr()
            self.context.execute_async_v2(bindings,
                                          torch.cuda.current_stream().cuda_stream)
            return outputs

    model = TRTWrapper('model.engine', ['output'])
    output = model(dict(input=torch.randn(1, 3, 224, 224).cuda()))
    print(output)

    Using the C++ API

    #include <fstream>
    #include <iostream>
    #include <cassert>
    #include <NvInfer.h>
    #include <../samples/common/logger.h>

    #define CHECK(status) \
        do\
        {\
            auto ret = (status);\
            if (ret != 0)\
            {\
                std::cerr << "Cuda failure: " << ret << std::endl;\
                abort();\
            }\
        } while (0)

    using namespace nvinfer1;
    using namespace sample;

    const char* IN_NAME = "input";
    const char* OUT_NAME = "output";
    static const int IN_H = 224;
    static const int IN_W = 224;
    static const int BATCH_SIZE = 1;
    static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

    void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
    {
        const ICudaEngine& engine = context.getEngine();

        // Pointers to input and output device buffers to pass to engine.
        // Engine requires exactly IEngine::getNbBindings() number of buffers.
        assert(engine.getNbBindings() == 2);
        void* buffers[2];

        // In order to bind the buffers, we need to know the names of the input and output tensors.
        // Note that indices are guaranteed to be less than IEngine::getNbBindings()
        const int inputIndex = engine.getBindingIndex(IN_NAME);
        const int outputIndex = engine.getBindingIndex(OUT_NAME);

        // Create GPU buffers on device
        CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
        CHECK(cudaMalloc(&buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float)));

        // Create stream
        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));

        // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
        CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * IN_H * IN_W * sizeof(float), cudaMemcpyHostToDevice, stream));
        context.enqueue(batchSize, buffers, stream, nullptr);
        CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
        cudaStreamSynchronize(stream);

        // Release stream and buffers
        cudaStreamDestroy(stream);
        CHECK(cudaFree(buffers[inputIndex]));
        CHECK(cudaFree(buffers[outputIndex]));
    }

    int main(int argc, char** argv)
    {
        // create a model using the API directly and serialize it to a stream
        char* trtModelStream{ nullptr };
        size_t size{ 0 };

        std::ifstream file("model.engine", std::ios::binary);
        if (file.good()) {
            file.seekg(0, file.end);
            size = file.tellg();
            file.seekg(0, file.beg);
            trtModelStream = new char[size];
            assert(trtModelStream);
            file.read(trtModelStream, size);
            file.close();
        }

        Logger m_logger;
        IRuntime* runtime = createInferRuntime(m_logger);
        assert(runtime != nullptr);
        ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
        assert(engine != nullptr);
        IExecutionContext* context = engine->createExecutionContext();
        assert(context != nullptr);

        // generate input data
        float data[BATCH_SIZE * 3 * IN_H * IN_W];
        for (int i = 0; i < BATCH_SIZE * 3 * IN_H * IN_W; i++)
            data[i] = 1;

        // Run inference
        float prob[BATCH_SIZE * 3 * IN_H * IN_W / 4];
        doInference(*context, data, prob, BATCH_SIZE);

        // Destroy the engine
        context->destroy();
        engine->destroy();
        runtime->destroy();
        return 0;
    }

  • Original article: https://blog.csdn.net/zhuikefeng/article/details/133130275