https://github.com/wang-xinyu/pytorchx
https://github.com/wang-xinyu/tensorrtx
This post picks two simple examples, lenet and mlp, to learn the TensorRT C++ API.
The code excerpts skip the assert checks and the destroy/cleanup calls, keeping only the core parts.
pytorchx contains just two files, model.py and inference.py: model.py builds the PyTorch model and saves the weights as model.pth, and inference.py converts model.pth into model.wts.
The model.wts file is nothing complicated; it simply dumps the model weights in a text format so they can be read back in C++:
# mlp
2
linear.weight 1 3fffdecf
linear.bias 1 3b16e580
# lenet
10
conv1.weight 150 be40ee1b bd20baba ...
conv1.bias 6 bd32705a 3e2182a8 ...
conv2.weight 2400 3c6f2224 3c69308f ...
conv2.bias 16 bd183967 bcb1ac89 ...
fc1.weight 48000 3c162c20 bd25196a ...
fc1.bias 120 3d3c3d4a bc64b947 ...
fc2.weight 10080 bce095a3 3d33b9dc ...
fc2.bias 84 bc71eaa0 3d9b276d ...
fc3.weight 840 3c25286d 3d855351 ...
fc3.bias 10 bdbe4bb8 3b119ed1 ...
That is all the Python side does; the code is simple enough that I won't analyze it. Everything after this is the painful C++ part.
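As a first taste of that C++ side, here is a minimal sketch of a parser for the .wts format shown above. It is not the actual loadWeights used later in tensorrtx (which returns a map of TensorRT Weights structs); readWts is an illustrative name, and the only assumption is the format itself: a count on the first line, then "name count hex hex ..." per entry, where each hex token is the raw 32-bit pattern of a float.
#include <cstdint>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>

// Sketch only: parse "count\nname size hex hex ..." into name -> float values.
std::map<std::string, std::vector<float>> readWts(const std::string& path) {
    std::ifstream in(path);
    int32_t count = 0;
    in >> count;                                   // first line: number of entries
    std::map<std::string, std::vector<float>> weights;
    for (int32_t i = 0; i < count; ++i) {
        std::string name;
        uint32_t size = 0;
        in >> name >> std::dec >> size;            // e.g. "conv1.weight 150"
        std::vector<float> values(size);
        for (uint32_t j = 0; j < size; ++j) {
            uint32_t bits = 0;
            in >> std::hex >> bits;                // e.g. 3fffdecf
            std::memcpy(&values[j], &bits, sizeof(bits));  // reinterpret bits as float
        }
        weights[name] = std::move(values);
    }
    return weights;
}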
I won't dig into the build setup for now; it is driven by CMakeLists.txt, which is nearly identical across models. The real content is in model.cpp, and it is covered here in two parts: ./model -s, which builds the engine and writes model.engine, and ./model -d, which runs inference.
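For orientation, main() in these examples simply dispatches on that flag. The sketch below is illustrative, not the repo's exact code; the printed messages stand in for the two code paths walked through in the rest of this post.
#include <iostream>
#include <string>

int main(int argc, char** argv) {
    if (argc == 2 && std::string(argv[1]) == "-s") {
        // build the network with the TensorRT API, serialize it, write model.engine
        std::cout << "would call APIToModel and write model.engine\n";
    } else if (argc == 2 && std::string(argv[1]) == "-d") {
        // read model.engine, deserialize it, and run doInference
        std::cout << "would load model.engine and run inference\n";
    } else {
        std::cerr << "usage: ./model -s (build engine) | ./model -d (run inference)\n";
        return -1;
    }
    return 0;
}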
The main flow is the same for both models: create a modelStream → build the model with APIToModel → write model.engine.
IHostMemory* modelStream{nullptr};
APIToModel(1, &modelStream);
std::ofstream p("../model.engine", std::ios::binary);
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
APIToModel is again the same for both models: it creates a builder with the logger from logging.h and builds the model. The author describes that header as "A logger file for using NVIDIA TRT API (mostly same for all models)", so it can be used as-is and is skipped here. Clearly the core is createModelEngine.
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream){
// Create builder with the help of logger
IBuilder *builder = createInferBuilder(gLogger);
// Create hardware configs
IBuilderConfig *config = builder->createBuilderConfig();
// Build an engine
ICudaEngine* engine = createModelEngine(maxBatchSize, builder, config, DataType::kFLOAT);
assert(engine != nullptr);
// serialize the engine into binary stream
(*modelStream) = engine->serialize();
// free up the memory
engine->destroy();
builder->destroy();
}
createModelEngine
(1) Flow
Apart from building the network, the only difference between the two models is the workspace size. The official blog describes setMaxWorkspaceSize; roughly, it sets how much GPU scratch memory TensorRT may allocate. 1ULL << 30 is 1 GB, so the unit is bytes.
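The shift expressions are just byte counts; a quick check of the values that appear in these examples (setMaxWorkspaceSize is the call shown below):
// Workspace sizes are plain byte counts written as bit shifts:
//   1 << 20    =     1,048,576 bytes =  1 MiB  (mlp example below)
//   16 << 20   =    16,777,216 bytes = 16 MiB  (lenet example below)
//   1ULL << 30 = 1,073,741,824 bytes =  1 GiB  (the value mentioned in the blog)
config->setMaxWorkspaceSize(16 << 20);  // allow TensorRT up to 16 MiB of scratch GPU memory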
"两个函数的输入是相同的"
ICudaEngine* createMLPEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt)
ICudaEngine* createLenetEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
"读取权重, wts文件的格式是相同的, 估计是通用的, 读进来是个字典, 跳过读取细节"
std::map<std::string, Weights> weightMap = loadWeights("../mlp.wts");
std::map<std::string, Weights> weightMap = loadWeights("../lenet5.wts");
"创建空模型"
INetworkDefinition* network = builder->createNetworkV2(0U);
INetworkDefinition* network = builder->createNetworkV2(0U);
"创建输入, name 和 type 是一样的, lenet的维度是 1, 28, 28"
"这里看起来比pytorch的输入少了一个batchsize的维度, 但在后面有 setMaxBatchSize"
ITensor* data = network->addInput("data", DataType::kFLOAT, Dims3{1, 1, 1});
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
"构建网络, 后面单独说明"
"Set configurations"
builder->setMaxBatchSize(1);
builder->setMaxBatchSize(maxBatchSize);
"Set workspace size"
config->setMaxWorkspaceSize(1 << 20);
config->setMaxWorkspaceSize(16 << 20);
"Build CUDA Engine using network and configurations"
ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
(2) Building the network
// mlp
IFullyConnectedLayer *fc1 = network->addFullyConnected(*data, 1, weightMap["linear.weight"], weightMap["linear.bias"]);
fc1->getOutput(0)->setName("out");
network->markOutput(*fc1->getOutput(0));
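Putting the pieces above together, the whole createMLPEngine is roughly the following. This is a condensed sketch assembled from the lines already shown, with asserts and freeing of the host-side weight buffers omitted as elsewhere in this post (dt is unused here since the code hard-codes DataType::kFLOAT).
ICudaEngine* createMLPEngine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt) {
    // 1. load the weights exported to mlp.wts
    std::map<std::string, Weights> weightMap = loadWeights("../mlp.wts");
    // 2. empty network definition
    INetworkDefinition* network = builder->createNetworkV2(0U);
    // 3. input tensor named "data", a single float
    ITensor* data = network->addInput("data", DataType::kFLOAT, Dims3{1, 1, 1});
    // 4. one fully connected layer; its output is named "out" and marked as the network output
    IFullyConnectedLayer *fc1 = network->addFullyConnected(*data, 1, weightMap["linear.weight"], weightMap["linear.bias"]);
    fc1->getOutput(0)->setName("out");
    network->markOutput(*fc1->getOutput(0));
    // 5. builder/config settings, then build the engine
    builder->setMaxBatchSize(1);          // the mlp example hard-codes batch size 1
    config->setMaxWorkspaceSize(1 << 20); // 1 MiB workspace
    ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config);
    return engine;
}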
The mlp structure is too simple, so let's look at lenet directly. First, recall the lenet structure:
[1,1,32,32] → Conv2d(1,6,5,1,0)+relu → [1,6,28,28] → AvgPool2d(2,2,0) → [1,6,14,14] → Conv2d(6,16,5,1,0)+relu → [1,16,10,10] → AvgPool2d(2,2,0) → [1,16,5,5] → [1,400] → Linear(400,120)+relu → [1,120] → Linear(120,84)+relu → [1,84] → Linear(84,10)+softmax → [1,10]
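The spatial sizes in this chain follow the usual output-size formula, output = (input - kernel + 2*padding) / stride + 1. A quick check of the steps above (outSize is just an illustrative helper):
#include <cstdio>

// Standard conv/pool output-size formula: (in - kernel + 2*pad) / stride + 1.
int outSize(int in, int kernel, int stride, int pad) {
    return (in - kernel + 2 * pad) / stride + 1;
}

int main() {
    printf("conv1: %d\n", outSize(32, 5, 1, 0));  // 28 -> [1,6,28,28]
    printf("pool1: %d\n", outSize(28, 2, 2, 0));  // 14 -> [1,6,14,14]
    printf("conv2: %d\n", outSize(14, 5, 1, 0));  // 10 -> [1,16,10,10]
    printf("pool2: %d\n", outSize(10, 2, 2, 0));  // 5  -> [1,16,5,5], flattened to [1,400]
    return 0;
}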
Matching this against the PyTorch API shows how the TensorRT API is used:
Convolution: addConvolutionNd(input, number of output maps, kernel size, weights, bias)
Activation: addActivation(input, activation type)
Pooling: addPoolingNd(input, pooling type, kernel size)
setStrideNd sets the stride for convolution and pooling layers
Fully connected: addFullyConnected(input, number of outputs, weights, bias)
Finally, the output of the last layer is given a name and marked as the network output.
// lenet
IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 6, DimsHW{5, 5}, weightMap["conv1.weight"], weightMap["conv1.bias"]);
conv1->setStrideNd(DimsHW{1, 1});
IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kAVERAGE, DimsHW{2, 2});
pool1->setStrideNd(DimsHW{2, 2});
IConvolutionLayer* conv2 = network->addConvolutionNd(*pool1->getOutput(0), 16, DimsHW{5, 5}, weightMap["conv2.weight"], weightMap["conv2.bias"]);
conv2->setStrideNd(DimsHW{1, 1});
IActivationLayer* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU);
IPoolingLayer* pool2 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kAVERAGE, DimsHW{2, 2});
pool2->setStrideNd(DimsHW{2, 2});
IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 120, weightMap["fc1.weight"], weightMap["fc1.bias"]);
IActivationLayer* relu3 = network->addActivation(*fc1->getOutput(0), ActivationType::kRELU);
IFullyConnectedLayer* fc2 = network->addFullyConnected(*relu3->getOutput(0), 84, weightMap["fc2.weight"], weightMap["fc2.bias"]);
IActivationLayer* relu4 = network->addActivation(*fc2->getOutput(0), ActivationType::kRELU);
IFullyConnectedLayer* fc3 = network->addFullyConnected(*relu4->getOutput(0), OUTPUT_SIZE, weightMap["fc3.weight"], weightMap["fc3.bias"]);
ISoftMaxLayer* prob = network->addSoftMax(*fc3->getOutput(0));
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*prob->getOutput(0));
The whole inference process, except doInference, uses TensorRT APIs directly; all it needs is the ready-made logging.h and model.engine.
// buffer that will hold the serialized engine read back from file
char *trtModelStream{nullptr};
size_t size{0};
// read model from the engine file
std::ifstream file("../model.engine", std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}
// create a runtime (required for deserialization of model) with NVIDIA's logger
IRuntime *runtime = createInferRuntime(gLogger);
// deserialize engine for using the char-stream
ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
// create execution context -- required for inference executions
IExecutionContext *context = engine->createExecutionContext();
// create input and output buffers
float data[INPUT_SIZE]; // mlp:1 lenet:H*W
float out[OUTPUT_SIZE]; // mlp:1 lenet:10
// time the execution
auto start = std::chrono::system_clock::now();
// do inference using the parameters
doInference(*context, data, out, 1);
// time the execution
auto end = std::chrono::system_clock::now();
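The excerpt above leaves data uninitialized; in these toy examples the input is simply filled with a constant before calling doInference (the lenet example uses all 1.0f for its 32x32 input). A sketch of the surrounding code, with the printing of raw outputs added for illustration:
// Illustrative only: fill the input with a constant, then print the outputs.
for (int i = 0; i < INPUT_SIZE; ++i)
    data[i] = 1.0f;

// ... doInference(*context, data, out, 1) as above ...

for (int i = 0; i < OUTPUT_SIZE; ++i)
    std::cout << out[i] << " ";
std::cout << std::endl;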
doInference
doInference is also identical for lenet and mlp. The actual inference is a single line, context.enqueue(batchSize, buffers, stream, nullptr); everything else is allocating input/output buffers, creating the CUDA stream, and shuttling data between CPU and GPU.
A look at the yolov5 code shows the same thing: running inference is one call, and the main work is preparing the data.
void doInference(IExecutionContext &context, float *input, float *output, int batchSize) {
// Get engine from the context
const ICudaEngine &engine = context.getEngine();
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
assert(engine.getNbBindings() == 2);
void *buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex("data");
const int outputIndex = engine.getBindingIndex("out");
// Create GPU buffers on device -- allocate memory for input and output
cudaMalloc(&buffers[inputIndex], batchSize * INPUT_SIZE * sizeof(float));
cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float));
// create CUDA stream for simultaneous CUDA operations
cudaStream_t stream;
cudaStreamCreate(&stream);
// copy input from host (CPU) to device (GPU) in stream
cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream);
// execute inference using context provided by engine
context.enqueue(batchSize, buffers, stream, nullptr);
// copy output back from device (GPU) to host (CPU)
cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost,
stream);
// synchronize the stream to prevent issues
// (block CUDA and wait for CUDA operations to be completed)
cudaStreamSynchronize(stream);
// Release stream and buffers (memory)
cudaStreamDestroy(stream);
cudaFree(buffers[inputIndex]);
cudaFree(buffers[outputIndex]);
}