Install CUDA and cuDNN first, then log in to the NVIDIA website and download the TensorRT tarball that matches the host's CUDA version.
Taking CUDA 10.2 as an example, pick the TensorRT tar package built for CUDA 10.2, then install and test it with commands like the following:
```bash
# Install the C++ libraries
cd /the/path/of/tensorrt/tar/gz/file
tar -zxvf TensorRT-8.2.5.1.linux.x86_64-gnu.cuda-10.2.cudnn8.2.tar.gz
export TENSORRT_DIR=$(pwd)/TensorRT-8.2.5.1
export LD_LIBRARY_PATH=$TENSORRT_DIR/lib:$LD_LIBRARY_PATH

# Install the Python package
pip install TensorRT-8.2.5.1/python/tensorrt-8.2.5.1-cp37-none-linux_x86_64.whl
python -c "import tensorrt; print(tensorrt.__version__)"  # prints 8.2.5.1 if the install succeeded
# pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt==9.2.0.post12.dev also works
```
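Beyond printing the version, a quick smoke test is to create a logger and a builder, which forces libnvinfer to load. A minimal sketch, assuming only the wheel installed above:

```python
import tensorrt as trt

# If this fails, LD_LIBRARY_PATH is usually missing the TensorRT lib directory
# or the driver/CUDA combination is incompatible.
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
print('TensorRT version:', trt.__version__)
print('Platform has fast FP16:', builder.platform_has_fast_fp16)
```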
With TensorRT installed, an engine can be built directly with the Python network-definition API. The example below defines a single max-pooling layer and serializes the resulting engine to disk:

```python
import tensorrt as trt

verbose = True
IN_NAME = 'input'
OUT_NAME = 'output'
IN_H = 224
IN_W = 224
BATCH_SIZE = 1

EXPLICIT_BATCH = 1 << (int)(
    trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()
with trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config(
) as config, builder.create_network(EXPLICIT_BATCH) as network:
    # define network
    input_tensor = network.add_input(
        name=IN_NAME, dtype=trt.float32, shape=(BATCH_SIZE, 3, IN_H, IN_W))
    pool = network.add_pooling(
        input=input_tensor, type=trt.PoolingType.MAX, window_size=(2, 2))
    pool.stride = (2, 2)
    pool.get_output(0).name = OUT_NAME
    network.mark_output(pool.get_output(0))

    # serialize the model to engine file
    # (this profile is never added to config and the input shape is static,
    # so it does not affect the build)
    profile = builder.create_optimization_profile()
    profile.set_shape_input('input', *[[BATCH_SIZE, 3, IN_H, IN_W]] * 3)
    builder.max_batch_size = 1
    config.max_workspace_size = 1 << 30
    engine = builder.build_engine(network, config)
    with open('model_python_trt.engine', mode='wb') as f:
        f.write(bytearray(engine.serialize()))
        print("generating file done!")
```
The same engine can be built and serialized with the C++ API:

```cpp
#include <fstream>
#include <iostream>
#include <cassert>

#include <NvInfer.h>
#include <../samples/common/logger.h>

using namespace nvinfer1;
using namespace sample;

const char* IN_NAME = "input";
const char* OUT_NAME = "output";
static const int IN_H = 224;
static const int IN_W = 224;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

int main(int argc, char** argv)
{
    // Create builder
    Logger m_logger;
    IBuilder* builder = createInferBuilder(m_logger);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network
    INetworkDefinition* network = builder->createNetworkV2(EXPLICIT_BATCH);
    ITensor* input_tensor = network->addInput(IN_NAME, DataType::kFLOAT, Dims4{ BATCH_SIZE, 3, IN_H, IN_W });
    IPoolingLayer* pool = network->addPoolingNd(*input_tensor, PoolingType::kMAX, DimsHW{ 2, 2 });
    pool->setStrideNd(DimsHW{ 2, 2 });
    pool->getOutput(0)->setName(OUT_NAME);
    network->markOutput(*pool->getOutput(0));

    // Build engine
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    profile->setDimensions(IN_NAME, OptProfileSelector::kMIN, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    profile->setDimensions(IN_NAME, OptProfileSelector::kOPT, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    profile->setDimensions(IN_NAME, OptProfileSelector::kMAX, Dims4(BATCH_SIZE, 3, IN_H, IN_W));
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Serialize the model to engine file
    IHostMemory* modelStream{ nullptr };
    assert(engine != nullptr);
    modelStream = engine->serialize();

    std::ofstream p("model.engine", std::ios::binary);
    if (!p) {
        std::cerr << "could not open output file to save model" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    std::cout << "generating file done!" << std::endl;

    // Release resources
    modelStream->destroy();
    network->destroy();
    engine->destroy();
    builder->destroy();
    config->destroy();
    return 0;
}
```
Failed attempt: pulling the official TensorRT Docker images and using the trtexec inside them. On an A800 machine with driver version 515.105, every image version reported that the CUDA version did not match the driver version.

```bash
docker pull nvcr.io/nvidia/tensorrt:xx.xx-py3
# the first xx is the year and the second xx is the month, e.g. 22.09 or 23.01
```
Successful attempt: downloading the TensorRT tar package directly and running the trtexec binary from its bin directory, which appears not to depend on the locally installed CUDA and cuDNN. Note, however, that an engine built with one TensorRT version may fail to run under a different TensorRT version.
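For reference, the conversion itself only needs the trtexec shipped inside the tar package. A minimal sketch of driving it from Python; the paths are assumptions, and `--onnx`/`--saveEngine` are standard trtexec flags:

```python
import os
import subprocess

# Assumes TENSORRT_DIR points at the unpacked tar from the install step above.
trtexec = os.path.join(os.environ['TENSORRT_DIR'], 'bin', 'trtexec')

# Convert an ONNX model into an engine; the resulting model.engine is tied to
# the TensorRT version of this trtexec binary.
subprocess.run([trtexec, '--onnx=model.onnx', '--saveEngine=model.engine'], check=True)
```

The same ONNX-to-engine conversion can also be done programmatically with the ONNX parser, as the next example shows.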
```python
import torch
import onnx
import tensorrt as trt


onnx_model = 'model.onnx'

class NaiveModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pool = torch.nn.MaxPool2d(2, 2)

    def forward(self, x):
        return self.pool(x)

device = torch.device('cuda:0')

# generate ONNX model
torch.onnx.export(NaiveModel(), torch.randn(1, 3, 224, 224), onnx_model,
                  input_names=['input'], output_names=['output'], opset_version=11)
onnx_model = onnx.load(onnx_model)

# create builder and network
logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
EXPLICIT_BATCH = 1 << (int)(
    trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
network = builder.create_network(EXPLICIT_BATCH)

# parse onnx
parser = trt.OnnxParser(network, logger)

if not parser.parse(onnx_model.SerializeToString()):
    error_msgs = ''
    for error in range(parser.num_errors):
        error_msgs += f'{parser.get_error(error)}\n'
    raise RuntimeError(f'Failed to parse onnx, {error_msgs}')

config = builder.create_builder_config()
config.max_workspace_size = 1 << 20
profile = builder.create_optimization_profile()

profile.set_shape('input', [1, 3, 224, 224], [1, 3, 224, 224], [1, 3, 224, 224])
config.add_optimization_profile(profile)
# create engine
with torch.cuda.device(device):
    engine = builder.build_engine(network, config)

with open('model.engine', mode='wb') as f:
    f.write(bytearray(engine.serialize()))
    print("generating file done!")
```
The C++ version of the ONNX parsing path:

```cpp
#include <fstream>
#include <iostream>
#include <cassert>

#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <../samples/common/logger.h>

using namespace nvinfer1;
using namespace nvonnxparser;
using namespace sample;

int main(int argc, char** argv)
{
    // Create builder
    Logger m_logger;
    IBuilder* builder = createInferBuilder(m_logger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    IBuilderConfig* config = builder->createBuilderConfig();

    // Create model to populate the network
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

    // Parse ONNX file
    IParser* parser = nvonnxparser::createParser(*network, m_logger);
    bool parser_status = parser->parseFromFile("model.onnx", static_cast<int>(ILogger::Severity::kWARNING));

    // Get the name of the network input
    Dims dim = network->getInput(0)->getDimensions();
    if (dim.d[0] == -1)  // -1 means it is a dynamic model
    {
        const char* name = network->getInput(0)->getName();
        IOptimizationProfile* profile = builder->createOptimizationProfile();
        profile->setDimensions(name, OptProfileSelector::kMIN, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        profile->setDimensions(name, OptProfileSelector::kOPT, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        profile->setDimensions(name, OptProfileSelector::kMAX, Dims4(1, dim.d[1], dim.d[2], dim.d[3]));
        config->addOptimizationProfile(profile);
    }

    // Build engine
    config->setMaxWorkspaceSize(1 << 20);
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);

    // Serialize the model to engine file
    IHostMemory* modelStream{ nullptr };
    assert(engine != nullptr);
    modelStream = engine->serialize();

    std::ofstream p("model.engine", std::ios::binary);
    if (!p) {
        std::cerr << "could not open output file to save model" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    std::cout << "generate file success!" << std::endl;

    // Release resources
    modelStream->destroy();
    network->destroy();
    engine->destroy();
    builder->destroy();
    config->destroy();
    return 0;
}
```
With the engine built, inference feeds in a 1x3x224x224 tensor and gets back a 1x3x112x112 tensor. The Python version wraps the engine in a small torch.nn.Module:
```python
from typing import Union, Optional, Sequence, Dict, Any

import torch
import tensorrt as trt

class TRTWrapper(torch.nn.Module):
    def __init__(self, engine: Union[str, trt.ICudaEngine],
                 output_names: Optional[Sequence[str]] = None) -> None:
        super().__init__()
        self.engine = engine
        if isinstance(self.engine, str):
            with trt.Logger() as logger, trt.Runtime(logger) as runtime:
                with open(self.engine, mode='rb') as f:
                    engine_bytes = f.read()
                self.engine = runtime.deserialize_cuda_engine(engine_bytes)
        self.context = self.engine.create_execution_context()
        names = [_ for _ in self.engine]
        input_names = list(filter(self.engine.binding_is_input, names))
        self._input_names = input_names
        self._output_names = output_names

        if self._output_names is None:
            output_names = list(set(names) - set(input_names))
            self._output_names = output_names

    def forward(self, inputs: Dict[str, torch.Tensor]):
        assert self._input_names is not None
        assert self._output_names is not None
        bindings = [None] * (len(self._input_names) + len(self._output_names))
        profile_id = 0
        for input_name, input_tensor in inputs.items():
            # check if input shape is valid
            profile = self.engine.get_profile_shape(profile_id, input_name)
            assert input_tensor.dim() == len(
                profile[0]), 'Input dim is different from engine profile.'
            for s_min, s_input, s_max in zip(profile[0], input_tensor.shape,
                                             profile[2]):
                assert s_min <= s_input <= s_max, \
                    'Input shape should be between ' \
                    + f'{profile[0]} and {profile[2]}' \
                    + f' but get {tuple(input_tensor.shape)}.'
            idx = self.engine.get_binding_index(input_name)

            # All input tensors must be gpu variables
            assert 'cuda' in input_tensor.device.type
            input_tensor = input_tensor.contiguous()
            if input_tensor.dtype == torch.long:
                input_tensor = input_tensor.int()
            self.context.set_binding_shape(idx, tuple(input_tensor.shape))
            bindings[idx] = input_tensor.contiguous().data_ptr()

        # create output tensors
        outputs = {}
        for output_name in self._output_names:
            idx = self.engine.get_binding_index(output_name)
            dtype = torch.float32
            shape = tuple(self.context.get_binding_shape(idx))

            device = torch.device('cuda')
            output = torch.empty(size=shape, dtype=dtype, device=device)
            outputs[output_name] = output
            bindings[idx] = output.data_ptr()
        self.context.execute_async_v2(bindings,
                                      torch.cuda.current_stream().cuda_stream)
        return outputs

model = TRTWrapper('model.engine', ['output'])
output = model(dict(input=torch.randn(1, 3, 224, 224).cuda()))
print(output)
```
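Since the network is just a 2x2 max pool, the wrapper's result can be checked directly against PyTorch. A quick sanity check, assuming the `model.engine` built from the ONNX example and the `TRTWrapper` class above:

```python
import torch

x = torch.randn(1, 3, 224, 224).cuda()
trt_out = TRTWrapper('model.engine', ['output'])(dict(input=x))['output']
torch_out = torch.nn.functional.max_pool2d(x, kernel_size=2, stride=2)

# Max pooling only selects existing values, so the two results should agree
# up to floating-point representation.
print(torch.allclose(trt_out, torch_out, atol=1e-5))
```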
The C++ counterpart of the inference step reads the engine file back, copies the input to the GPU, runs the execution context, and copies the result back to the host:

```cpp
#include <fstream>
#include <iostream>
#include <cassert>

#include <cuda_runtime_api.h>
#include <NvInfer.h>
#include <../samples/common/logger.h>

#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

using namespace nvinfer1;
using namespace sample;

const char* IN_NAME = "input";
const char* OUT_NAME = "output";
static const int IN_H = 224;
static const int IN_W = 224;
static const int BATCH_SIZE = 1;
static const int EXPLICIT_BATCH = 1 << (int)(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);


void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();

    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(IN_NAME);
    const int outputIndex = engine.getBindingIndex(OUT_NAME);

    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * IN_H * IN_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float)));

    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * IN_H * IN_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    // The engine comes from an explicit-batch network, so use enqueueV2 rather than the implicit-batch enqueue.
    context.enqueueV2(buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * 3 * IN_H * IN_W / 4 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}

int main(int argc, char** argv)
{
    // Read the serialized engine file into memory
    char* trtModelStream{ nullptr };
    size_t size{ 0 };

    std::ifstream file("model.engine", std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }

    Logger m_logger;
    IRuntime* runtime = createInferRuntime(m_logger);
    assert(runtime != nullptr);
    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
    assert(engine != nullptr);
    IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    // generate input data
    float data[BATCH_SIZE * 3 * IN_H * IN_W];
    for (int i = 0; i < BATCH_SIZE * 3 * IN_H * IN_W; i++)
        data[i] = 1;

    // Run inference
    float prob[BATCH_SIZE * 3 * IN_H * IN_W / 4];
    doInference(*context, data, prob, BATCH_SIZE);

    // Destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();
    return 0;
}
```