• Exporting a swin-transformer model with OpenMMLab and running inference with ONNX Runtime and TensorRT


    Exporting the ONNX file

    Exporting a swin-transformer ONNX file through mmpretrain is straightforward. Note that opset_version=12 must be set; this is the first pitfall: I initially set opset_version=11, and that caused problems later when converting to a TensorRT engine.

    import torch
    from mmpretrain import get_model, inference_model
    
    
    model = get_model('swin-tiny_16xb64_in1k', pretrained='swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth', device='cpu')
    
    input = torch.zeros(1, 3, 224, 224)
    out = model(input)
    torch.onnx.export(model, input, "swin_transformer.onnx", opset_version=12)
    
    If mmdeploy is installed, the model can also be converted with the following script:

    from mmdeploy.apis import torch2onnx
    from mmdeploy.backend.sdk.export_info import export2SDK
    
    
    img = 'goldfish.jpg'
    work_dir = './work_dir/onnx/swin_transformer'
    save_file = './end2end.onnx'
    deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_dynamic.py'
    model_cfg = 'mmpretrain/configs/swin_transformer/swin-tiny_16xb64_in1k.py'
    model_checkpoint = './checkpoints/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth'
    device = 'cpu'
    
    # 1. convert model to onnx
    torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, model_checkpoint, device)
    
    # 2. extract pipeline info for sdk use (dump-info)
    export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, device=device)
    

    ONNX Runtime inference

    Python inference:

    import cv2
    import numpy as np
    import onnxruntime
    
    
    if __name__ == '__main__':
        img = cv2.imread('goldfish.jpg')
        if img.shape[0] < img.shape[1]:  # height is the shorter side
            img = cv2.resize(img, (int(256*img.shape[1]/img.shape[0]), 256))
        else:
            img = cv2.resize(img, (256, int(256*img.shape[0]/img.shape[1])))
        
        crop_size = min(img.shape[0], img.shape[1])
        left = int((img.shape[1]-crop_size)/2)
        top = int((img.shape[0]-crop_size)/2)
        img_crop = img[top:top+crop_size, left:left+crop_size]
        img_crop = cv2.resize(img_crop, (224,224))
        
        img_crop = img_crop[:,:,::-1].transpose(2,0,1).astype(np.float32)  # BGR to RGB and HWC to CHW
        img_crop[0,:] = (img_crop[0,:] - 123.675) / 58.395   
        img_crop[1,:] = (img_crop[1,:] - 116.28) / 57.12
        img_crop[2,:] = (img_crop[2,:] - 103.53) / 57.375
        input = np.expand_dims(img_crop, axis=0)   
    
        onnx_session = onnxruntime.InferenceSession("swin_transformer.onnx", providers=['CUDAExecutionProvider','CPUExecutionProvider'])
    
        input_name=[]
        for node in onnx_session.get_inputs():
            input_name.append(node.name)
    
        output_name=[]
        for node in onnx_session.get_outputs():
            output_name.append(node.name)
    
        input_feed={}
        for name in input_name:
            input_feed[name] = input
    
        pred = onnx_session.run(None, input_feed)[0]
        print(np.argmax(pred))
    
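    The script above only prints the argmax index. If class confidences are also wanted, a softmax over the logits gives top-k scores; a small sketch reusing the pred array from the code above:

    probs = np.exp(pred[0] - pred[0].max())
    probs /= probs.sum()
    top5 = probs.argsort()[::-1][:5]
    for idx in top5:
        print(idx, probs[idx])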

    C++ inference:

    #include <onnxruntime_cxx_api.h>
    #include <opencv2/opencv.hpp>
    #include <iostream>
    #include <algorithm>
    
    
    int main(int argc, char* argv[])
    {
    	Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "cls");
    	Ort::SessionOptions session_options;
    	session_options.SetIntraOpNumThreads(1);
    	session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);
    
    	//OrtCUDAProviderOptions cuda_option;
    	//cuda_option.device_id = 0;
    	//cuda_option.arena_extend_strategy = 0;
    	//cuda_option.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive;
    	//cuda_option.gpu_mem_limit = SIZE_MAX;
    	//cuda_option.do_copy_in_default_stream = 1;
    	//session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
    	//session_options.AppendExecutionProvider_CUDA(cuda_option);
    
    	const wchar_t* model_path = L"swin_transformer.onnx";
    	Ort::Session session(env, model_path, session_options);
    	Ort::AllocatorWithDefaultOptions allocator;
    
    	size_t num_input_nodes = session.GetInputCount();
    	std::vector<const char*> input_node_names = { "input" };
    	std::vector<const char*> output_node_names = { "output" };
    
    	cv::Mat image = cv::imread("goldfish.jpg", 1);
    	cv::resize(image, image, cv::Size(224, 224));
    
    	const size_t input_tensor_size = 1 * image.channels() * image.cols * image.rows;
    	std::vector<float> input_tensor_values(input_tensor_size);
    
    	// HWC (BGR) -> CHW (RGB) with per-channel mean/std normalization; the resized image is square, so cols == rows
    	for (int i = 0; i < image.cols; i++)
    	{
    		for (int j = 0; j < image.rows; j++)
    		{
    			input_tensor_values[0 * image.cols * image.rows + i * image.rows + j] = (image.ptr<uchar>(i)[j * 3 + 2] - 123.675) / 58.395;
    			input_tensor_values[1 * image.cols * image.rows + i * image.rows + j] = (image.ptr<uchar>(i)[j * 3 + 1] - 116.28) / 57.12;
    			input_tensor_values[2 * image.cols * image.rows + i * image.rows + j] = (image.ptr<uchar>(i)[j * 3 + 0] - 103.53) / 57.375;
    		}
    	}
    	
    	std::vector<int64_t> input_node_dims = { 1, image.channels(), image.cols, image.rows };
    	auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    	Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input_tensor_values.data(), input_tensor_size, input_node_dims.data(), input_node_dims.size());
    
    	std::vector<Ort::Value> ort_inputs;
    	ort_inputs.push_back(std::move(input_tensor));
    
    	std::vector<Ort::Value> output_tensors = session.Run(Ort::RunOptions{ nullptr }, input_node_names.data(), ort_inputs.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());
    
    	const float* rawOutput = output_tensors[0].GetTensorData<float>();
    	std::vector<int64_t> outputShape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
    	size_t count = output_tensors[0].GetTensorTypeAndShapeInfo().GetElementCount();
    	std::vector<float> output(rawOutput, rawOutput + count);
    
    	int predict_label = std::max_element(output.begin(), output.end()) - output.begin();
    	std::cout << predict_label << std::endl;
    
    	return 0;
    }
    

    If mmdeploy is installed, inference can also be done as follows.
    Python inference:

    from mmdeploy.apis import inference_model
    
    
    model_cfg = 'mmpretrain/configs/swin_transformer/swin-tiny_16xb64_in1k.py'
    deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_static.py'
    img = 'goldfish.jpg'
    backend_files = ['work_dir/onnx/swin_transformer/end2end.onnx']
    device = 'cpu'
    
    result = inference_model(model_cfg, deploy_cfg, backend_files, img, device)
    print(result)
    
    Or:

    import cv2
    from mmdeploy_runtime import Classifier
    
    
    img = cv2.imread('goldfish.jpg')
    classifier = Classifier(model_path='work_dir/onnx/swin_transformer', device_name='cpu')
    result = classifier(img)
    for label_id, score in result:
        print(label_id, score)
    

    For C++ inference, refer to: https://github.com/open-mmlab/mmdeploy/blob/main/demo/csrc/c/image_classification.cpp
    https://github.com/open-mmlab/mmdeploy/blob/main/demo/csrc/cpp/classifier.cxx

    Exporting the engine file

    Here the ONNX file is converted with trtexec; the version used is TensorRT-8.2.1.8.
    The model must first be simplified with onnxsim, which is the second pitfall; otherwise the conversion reports an error.

    import onnx
    from onnxsim import simplify
    
    
    onnx_model = onnx.load("swin_transformer.onnx")  # load onnx model
    model_simp, check = simplify(onnx_model)
    assert check, "Simplified ONNX model could not be validated"
    onnx.save(model_simp, "swin_transformer_sim.onnx")
    
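    Optionally, the simplified model can be compared against the original on a random input to confirm that simplification did not change the outputs. A short sketch, assuming both ONNX files are in the working directory:

    import numpy as np
    import onnxruntime
    
    
    x = np.random.rand(1, 3, 224, 224).astype(np.float32)
    ref = onnxruntime.InferenceSession("swin_transformer.onnx", providers=['CPUExecutionProvider'])
    sim = onnxruntime.InferenceSession("swin_transformer_sim.onnx", providers=['CPUExecutionProvider'])
    ref_out = ref.run(None, {ref.get_inputs()[0].name: x})[0]
    sim_out = sim.run(None, {sim.get_inputs()[0].name: x})[0]
    print(np.abs(ref_out - sim_out).max())  # should be on the order of floating-point rounding error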

    Then go to TensorRT's bin directory and run:

    ./trtexec.exe --onnx=swin_transformer_sim.onnx --saveEngine=swin_transformer.engine --workspace=20480
    

    The third pitfall: without the --workspace parameter the conversion may fail with an out-of-memory error. My machine has 32 GB of RAM, so I simply set a 20 GB workspace; set this parameter according to your own memory size.
    At this point, barring surprises, the engine file can be exported successfully.
    Exporting through mmdeploy did not work for me; I am not sure whether it is an environment configuration issue or a bug in the library on Windows.
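
    If you prefer to build the engine from Python instead of calling trtexec, the TensorRT Python API can do the equivalent. A minimal sketch, assuming the same TensorRT 8.2 environment and the simplified swin_transformer_sim.onnx from the previous step (the 20 GB workspace mirrors --workspace=20480):

    import tensorrt as trt
    
    
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, logger)
    
    with open("swin_transformer_sim.onnx", "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("failed to parse the ONNX file")
    
    config = builder.create_builder_config()
    config.max_workspace_size = 20 << 30  # 20 GB, analogous to --workspace=20480
    
    serialized_engine = builder.build_serialized_network(network, config)
    with open("swin_transformer.engine", "wb") as f:
        f.write(serialized_engine)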

    TensorRT inference

    Python inference:

    import cv2
    import numpy as np
    import tensorrt as trt
    import pycuda.autoinit  # handles CUDA context initialization, memory management and cleanup
    import pycuda.driver as cuda  # data transfer between GPU and CPU
    
    
    if __name__ == '__main__':
        # create the logger
        logger = trt.Logger(trt.Logger.WARNING)
        # create the runtime and deserialize the engine
        with open("swin_transformer.engine", "rb") as f, trt.Runtime(logger) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()
        # allocate page-locked host memory and device (GPU) memory
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # create a CUDA stream
        stream = cuda.Stream()
    
        img = cv2.imread('goldfish.jpg')
        if img.shape[0] < img.shape[1]:  # height is the shorter side
            img = cv2.resize(img, (int(256*img.shape[1]/img.shape[0]), 256))
        else:
            img = cv2.resize(img, (256, int(256*img.shape[0]/img.shape[1])))
        
        crop_size = min(img.shape[0], img.shape[1])
        left = int((img.shape[1]-crop_size)/2)
        top = int((img.shape[0]-crop_size)/2)
        img_crop = img[top:top+crop_size, left:left+crop_size]
        img_crop = cv2.resize(img_crop, (224,224))
        
        img_crop = img_crop[:,:,::-1].transpose(2,0,1).astype(np.float32)  # BGR to RGB and HWC to CHW
        img_crop[0,:] = (img_crop[0,:] - 123.675) / 58.395   
        img_crop[1,:] = (img_crop[1,:] - 116.28) / 57.12
        img_crop[2,:] = (img_crop[2,:] - 103.53) / 57.375
        input = np.expand_dims(img_crop, axis=0)   
        
        np.copyto(h_input, input.ravel())
    
        # create an execution context and run inference
        with engine.create_execution_context() as context:
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(d_input, h_input, stream)
            # Run inference.
            context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(h_output, d_output, stream)
            # Synchronize the stream
            stream.synchronize()
            # Return the host output; this equals the original model's output data
            pred = np.argmax(h_output)
            print(pred)
    
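    To get a rough latency number, the inference calls in the script above can be wrapped in a warm-up phase followed by a timed loop. A sketch that reuses the context, stream and d_input/d_output/h_input/h_output buffers already set up above:

    import time
    
    
    # warm-up so that lazy CUDA initialization does not skew the measurement
    for _ in range(10):
        cuda.memcpy_htod_async(d_input, h_input, stream)
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()
    
    # timed runs (inference only, no host<->device copies)
    n = 100
    start = time.time()
    for _ in range(n):
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    stream.synchronize()
    print(f"average latency: {(time.time() - start) / n * 1000:.2f} ms")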

    C++ inference:

    // TensorRT includes
    #include <NvInfer.h>
    #include <NvInferRuntime.h>
    #include <NvOnnxParser.h>  // ONNX parser header (only needed when building engines from ONNX)
    
    // CUDA includes
    #include <cuda_runtime.h>
    
    // system includes
    #include <stdio.h>
    #include <fstream>
    #include <iostream>
    #include <algorithm>
    #include <vector>
    #include <string>
    
    // OpenCV
    #include <opencv2/opencv.hpp>
    
    
    inline const char* severity_string(nvinfer1::ILogger::Severity t) {
    	switch (t) {
    	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
    	case nvinfer1::ILogger::Severity::kERROR:   return "error";
    	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
    	case nvinfer1::ILogger::Severity::kINFO:    return "info";
    	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
    	default: return "unknown";
    	}
    }
    
    class TRTLogger : public nvinfer1::ILogger {
    public:
    	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override {
    		if (severity <= Severity::kINFO) {
    			if (severity == Severity::kWARNING)
    				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
    			else if (severity <= Severity::kERROR)
    				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
    			else
    				printf("%s: %s\n", severity_string(severity), msg);
    		}
    	}
    } logger;
    
    std::vector<unsigned char> load_file(const std::string & file) {
    	std::ifstream in(file, std::ios::in | std::ios::binary);
    	if (!in.is_open())
    		return {};
    
    	in.seekg(0, std::ios::end);
    	size_t length = in.tellg();
    
    	std::vector<uint8_t> data;
    	if (length > 0) {
    		in.seekg(0, std::ios::beg);
    		data.resize(length);
    
    		in.read((char*)& data[0], length);
    	}
    	in.close();
    	return data;
    }
    
    
    void inference() {
    	// ------------------------------ 1. Prepare and load the model ----------------------------
    	TRTLogger logger;
    	auto engine_data = load_file("swin_transformer.engine");
    	// Before running inference, a runtime instance is needed; like the builder, the runtime needs a logger:
    	nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    	// With the model read into engine_data, deserialize it to obtain the engine
    	nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
    	if (engine == nullptr) {
    		printf("Deserialize cuda engine failed.\n");
    		runtime->destroy();
    		return;
    	}
    
    	nvinfer1::IExecutionContext* execution_context = engine->createExecutionContext();
    	cudaStream_t stream = nullptr;
    	// Create a CUDA stream so that this batch's inference is independent
    	cudaStreamCreate(&stream);
    
    	// ------------------------------ 2. Prepare the input data and copy it to the GPU ----------------------------
    	cv::Mat image = cv::imread("goldfish.jpg", 1);
    	cv::resize(image, image, cv::Size(224, 224));
    	cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
    	int input_numel = 1 * image.channels() * image.rows * image.cols;
    
    	float* input_data_host = nullptr;
    	cudaMallocHost(&input_data_host, input_numel * sizeof(float));
    
    	int image_area = image.cols * image.rows;
    	unsigned char* pimage = image.data;
    	float* phost_r = input_data_host + image_area * 0;
    	float* phost_g = input_data_host + image_area * 1;
    	float* phost_b = input_data_host + image_area * 2;
    	for (int i = 0; i < image_area; ++i, pimage += 3) {
    		*phost_r++ = (pimage[0] - 123.675) / 58.395;
    		*phost_g++ = (pimage[1] - 116.28 )/ 57.12;
    		*phost_b++ = (pimage[2] - 103.53 )/ 57.375;
    	}
    
    	float* input_data_device = nullptr;
    	float output_data_host[1000];
    	float* output_data_device = nullptr;
    	cudaMalloc(&input_data_device, input_numel * sizeof(float));
    	cudaMalloc(&output_data_device, sizeof(output_data_host));
    	cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream);
    
    	// A pointer array holds the GPU pointers of the input and output
    	float* bindings[] = { input_data_device, output_data_device };
    
    	// ------------------------------ 3. Run inference and copy the results back to the CPU ----------------------------
    	bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
    	cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream);
    	cudaStreamSynchronize(stream);
    
    	int predict_label = std::max_element(output_data_host, output_data_host + 1000) - output_data_host;
    	std::cout << "predict_label: " << predict_label << std::endl;
    
    	// ------------------------------ 4. Release resources ----------------------------
    	cudaStreamDestroy(stream);
    	execution_context->destroy();
    	engine->destroy();
    	runtime->destroy();
    }
    
    int main() {
    	inference();
    	return 0;
    }
    

    PS: after switching to another machine today I hit a fourth pitfall: the Python simplification script works with onnxsim == 0.4.33, while onnxsim == 0.4.35 and onnxsim == 0.4.38 both fail.
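
    A quick way to catch this early is to print the installed onnxsim version before running the simplification script. A trivial sketch using importlib.metadata (Python 3.8+):

    from importlib.metadata import version
    
    
    print(version('onnxsim'))  # 0.4.33 worked here; 0.4.35 and 0.4.38 did not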

    If mmdeploy is installed, inference can also be done as follows.
    Python inference:

    from mmdeploy.apis import inference_model
    
    
    model_cfg = 'mmpretrain/configs/swin_transformer/swin-tiny_16xb64_in1k.py'
    deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_tensorrt_dynamic-224x224-224x224.py'
    backend_files = ['work_dir/trt/swin_transformer/end2end.engine']
    img = 'goldfish.jpg'
    device = 'cuda'
    
    result = inference_model(model_cfg, deploy_cfg, backend_files, img, device)
    print(result)
    

    Or:

    import cv2
    from mmdeploy_runtime import Classifier
    
    
    img = cv2.imread('goldfish.jpg')
    classifier = Classifier(model_path='work_dir/trt/swin_transformer', device_name='cuda')
    result = classifier(img)
    for label_id, score in result:
        print(label_id, score)
    

    For C++ inference, refer to: https://github.com/open-mmlab/mmdeploy/blob/main/demo/csrc/c/image_classification.cpp
    https://github.com/open-mmlab/mmdeploy/blob/main/demo/csrc/cpp/classifier.cxx

  • Original article: https://blog.csdn.net/taifyang/article/details/133962084