• 47、以Orienmask实例分割算法为例子,学习TensorRT的Python和C++开发


    基本思想:一直想学TensorRT开发,但之前没有时间;最近有空,就系统学习一下。本文的资料大部分来自网络和官方手册,以学习为目的,同时推动自己的任务目标实现。

    配置环境:参考《50、ubuntu18.04&20.04+CUDA11.1+cudnn11.3+TensorRT7.2+Deepsteam5.1+vulkan环境搭建和YOLO5部署》(sxj731533730的博客-CSDN博客)

    1. ubuntu@ubuntu:~/glog/build$ python3
    2. Python 3.8.10 (default, Nov 26 2021, 20:14:08)
    3. [GCC 9.3.0] on linux
    4. Type "help", "copyright", "credits" or "license" for more information.
    5. >>> import tensorrt
    6. >>> tensorrt.__version__
    7. '7.2.3.4'

    Documentation Archives :: NVIDIA Deep Learning TensorRT Documentation

    测试环境 显卡GTX1060 笔记本

    第一步:基本的tensorRT结构分析。参考https://www.jianshu.com/p/3c2fb7b45cc7

    1)首先以trt的Logger为参数,使用builder创建计算图类型INetworkDefinition。
    2)然后使用Parsers将onnx等网络框架下的结构填充计算图,当然也可以使用tensorrt的API进行构建。
    3)由计算图创建cuda环境下的引擎
    4)最终进行推理的则是cuda引擎生成的ExecutionContext,通过 engine.create_execution_context() 创建。

    补充一段测试onnx推理时间的代码

    1. import matplotlib.pyplot as plt
    2. from torch.autograd import Variable
    3. from argparse import ArgumentParser
    4. import torch
    5. import torch.utils.data
    6. import onnxruntime
    7. import cv2
    8. import numpy as np
    9. from onnxruntime.datasets import get_example
    10. import torch.nn.functional as F
    11. import math
    12. from model.orienmask_yolo_fpnplus import OrienMaskYOLOFPNPlus
    13. from utils.visualizer import InferenceVisualizer
    14. from torch.nn.modules.utils import _pair
    15. from eval.function import batched_nms
    16. from eval.orienmask_yolo_postprocess import OrienMaskYOLOPostProcess
    17. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    18. import os
    19. envpath = '/home/ubuntu/.local/lib/python3.8/site-packages/cv2/qt/plugins/platforms'
    20. os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = envpath
    def pad(image, size_divisor=32, pad_value=0):
        """Pad the last two dims of ``image`` up to a multiple of ``size_divisor``.

        The extra rows/columns are split between both sides, with the odd
        remainder going to the right/bottom.

        Returns:
            (padded_image, pad_info) where pad_info is
            [left, right, top, bottom, new_height, new_width].
        """
        src_h, src_w = image.shape[-2:]
        dst_h = int(math.ceil(src_h / size_divisor) * size_divisor)
        dst_w = int(math.ceil(src_w / size_divisor) * size_divisor)
        extra_w = dst_w - src_w
        extra_h = dst_h - src_h
        left = extra_w // 2
        top = extra_h // 2
        # F.pad expects the borders of the last dim first: (left, right, top, bottom).
        borders = [left, extra_w - left, top, extra_h - top]
        padded = F.pad(image, borders, value=pad_value)
        return padded, borders + [dst_h, dst_w]
    def torch2onnx(args, model):
        """Export ``model`` to ONNX, run it with ONNX Runtime and save a visualisation.

        Each stage is timed and printed in milliseconds:
        1. read ``args.img``, convert BGR->RGB, resize to 544x544, divide by 255, pad;
        2. export ``model`` to ``args.onnx_model_path`` (opset 11);
        3. run the exported file with ONNX Runtime and post-process its nine outputs;
        4. draw the first prediction and save it to ``args.onnxoutput``.

        Args:
            args: parsed CLI namespace providing ``img``, ``onnx_model_path``
                and ``onnxoutput``.
            model: the PyTorch model to export.

        Raises:
            FileNotFoundError: if OpenCV cannot read ``args.img``.
        """
        import datetime

        def elapsed_ms(started):
            # timedelta.microseconds is only the sub-second remainder and drops
            # whole seconds; total_seconds() covers the full interval.
            return round((datetime.datetime.now() - started).total_seconds() * 1000, 3)

        def to_tensor(array):
            return torch.tensor(array, device=device)

        start_load_data = datetime.datetime.now()
        img_src = cv2.imread(args.img)
        if img_src is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'failed to read image: {args.img}')
        img_color = cv2.cvtColor(img_src, cv2.COLOR_BGR2RGB)
        src_tensor = torch.tensor(img_color, device=device, dtype=torch.float32)
        # The third positional parameter of cv2.resize is `dst`, so the
        # interpolation mode has to be passed by keyword.
        img_resize = cv2.resize(img_color, (544, 544), interpolation=cv2.INTER_LINEAR)
        # HWC uint8 -> CHW float32, all channels divided by 255.
        chw = np.transpose(img_resize, (2, 0, 1)).astype(np.float32) / 255
        dummy_input = torch.from_numpy(chw).unsqueeze(0).to(device)
        dummy_input, pad_info = pad(dummy_input)
        print("load data:", elapsed_ms(start_load_data), "ms")

        start_convert = datetime.datetime.now()
        torch.onnx.export(model, dummy_input, args.onnx_model_path, input_names=["input"],
                          export_params=True,
                          keep_initializers_as_inputs=True,
                          do_constant_folding=True,
                          verbose=False,
                          opset_version=11)
        print("convert model:", elapsed_ms(start_convert), "ms")

        start_load = datetime.datetime.now()
        example_model = get_example(args.onnx_model_path)
        print("load model:", elapsed_ms(start_load), "ms")

        start_forward = datetime.datetime.now()
        session = onnxruntime.InferenceSession(example_model)
        input_name = session.get_inputs()[0].name
        result = session.run([], {input_name: dummy_input.detach().cpu().numpy()})
        # Fresh tensors are built for every consumer, exactly as before, so the
        # post-processing never sees aliased inputs.
        result_tuple = ((to_tensor(result[0]), to_tensor(result[1])),
                        (to_tensor(result[2]), to_tensor(result[3])),
                        (to_tensor(result[4]), to_tensor(result[5])))
        pred_bbox_batch = [to_tensor(result[0]), to_tensor(result[2]), to_tensor(result[4])]
        pred_orien_batch = [to_tensor(result[6]), to_tensor(result[7]), to_tensor(result[8])]

        # Post-processing settings handed to OrienMaskYOLOPostProcess.
        self_grid_size = [[17, 17], [34, 34], [68, 68]]
        self_image_size = [544, 544]
        self_anchors = [[12, 16], [19, 36], [40, 28], [36, 75], [76, 55], [72, 146],
                        [142, 110], [192, 243], [459, 401]]
        self_anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        self_num_classes = 80
        self_conf_thresh = 0.05
        self_nms_func = None
        self_nms_pre = 400
        self_nms_post = 100
        self_orien_thresh = 0.3
        item_orien = OrienMaskYOLOPostProcess(self_grid_size, self_image_size, self_anchors,
                                              self_anchor_mask, self_num_classes,
                                              self_conf_thresh, self_nms_func, self_nms_pre,
                                              self_nms_post, self_orien_thresh, device)
        predictions = item_orien.apply(result_tuple, pred_bbox_batch, pred_orien_batch)
        print("Forward & Postprocess:", elapsed_ms(start_forward), "ms")

        start_visual = datetime.datetime.now()
        dataset = 'COCO'
        with_mask = True
        conf_thresh = 0.3
        alpha = 0.6
        line_thickness = 1
        visualizer = InferenceVisualizer(dataset, device, with_mask, conf_thresh, alpha,
                                         line_thickness)
        show_image = visualizer(predictions[0], src_tensor, pad_info)
        plt.imsave(args.onnxoutput, show_image)
        print("Visualize::", elapsed_ms(start_visual), "ms")
    def main():
        """Parse the CLI options, load the checkpoint and run the ONNX export test."""
        cli = ArgumentParser()
        cli.add_argument('--img', help='Image file',
                         default="/home/ubuntu/OrienMask/assets/000000163126.jpg")
        cli.add_argument('--weights', help='Checkpoint file',
                         default="/home/ubuntu/CLionProjects/model/orienmask_yolo.pth")
        cli.add_argument('--onnx_model_path', help='onnx_model_path',
                         default="/home/ubuntu/CLionProjects/model/orienmask_yolo.onnx")
        cli.add_argument('--device', help='Device used for inference', default='cuda:0')
        cli.add_argument('--onnxoutput', help='Output image', default=r'onnxsxj731533730.jpg')
        cli.add_argument('--num_anchors', help='num_anchors', type=int, default=3)
        cli.add_argument('--num_classes', help='num_classes', type=int, default=80)
        args = cli.parse_args()

        network = OrienMaskYOLOFPNPlus(args.num_anchors, args.num_classes).to(device)
        checkpoint = torch.load(args.weights, map_location=device)
        # Checkpoints may wrap the weights under a 'state_dict' key.
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']
        network.load_state_dict(checkpoint, strict=True)
        torch2onnx(args, network)


    if __name__ == '__main__':
        main()

    第二步:转模型,参考 Developer Guide :: NVIDIA Deep Learning TensorRT Documentation

    1. import tensorrt as trt
    def build_engine(onnx_file_path, engine_file_path, half=False):
        """Takes an ONNX file and creates a TensorRT engine to run inference with.

        Args:
            onnx_file_path: path of the ONNX model to parse.
            engine_file_path: where the serialized engine is written.
            half: request FP16; only honoured when the builder reports
                ``platform_has_fast_fp16``.

        Returns:
            ``engine_file_path``.

        Raises:
            RuntimeError: if the ONNX file cannot be parsed or the engine
                cannot be built.
        """
        logger = trt.Logger(trt.Logger.INFO)
        builder = trt.Builder(logger)
        config = builder.create_builder_config()
        # 4 GiB. Same value as the former `4 * 1 << 30`, which parses as (4 * 1) << 30.
        config.max_workspace_size = 4 << 30
        flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(flag)
        parser = trt.OnnxParser(network, logger)
        if not parser.parse_from_file(str(onnx_file_path)):
            # Surface the parser's own diagnostics instead of discarding them.
            details = '; '.join(str(parser.get_error(i)) for i in range(parser.num_errors))
            message = f'failed to load ONNX file: {onnx_file_path}'
            if details:
                message += f' ({details})'
            raise RuntimeError(message)
        # bool() keeps this working for non-bool truthy values, unlike `half &= ...`.
        half = bool(half) and builder.platform_has_fast_fp16
        if half:
            config.set_flag(trt.BuilderFlag.FP16)
        engine = builder.build_engine(network, config)
        if engine is None:
            # A failed build returns None, which would otherwise blow up inside `with`.
            raise RuntimeError(f'failed to build TensorRT engine from: {onnx_file_path}')
        with engine, open(engine_file_path, 'wb') as t:
            t.write(engine.serialize())
        return engine_file_path
    if __name__ == "__main__":
        # Convert the simplified OrienMask ONNX model, requesting FP16.
        build_engine(
            "/home/ubuntu/CLionProjects/D435_OrienMask/model/orienmask_yolo_sim.onnx",
            "/home/ubuntu/CLionProjects/D435_OrienMask/model/orienmask_yolo_sim.engine",
            True,
        )

    转换结果。注意:如果提示workspace空间不够,需要修改下面的配置项,把移位数30改小一点(例如改成28,即1 GiB):

     config.max_workspace_size = 4 * 1 << 30

    转换过程

    1. /usr/bin/python3.8 /home/ubuntu/OrienMask/onnx2trt.py
    2. [TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.3.0 but loaded cuBLAS/cuBLAS LT 11.2.1
    3. [TensorRT] INFO: Some tactics do not have sufficient workspace memory to run. Increasing workspace size may increase performance, please check verbose output.
    4. [TensorRT] INFO: Detected 1 inputs and 9 output network tensors.
    5. [TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.3.0 but loaded cuBLAS/cuBLAS LT 11.2.1
    6. Process finished with exit code 0

    测试结果数据比对

    1. import tensorrt as trt
    2. import pycuda.driver as cuda
    3. import pycuda.autoinit
    4. from pycuda.tools import make_default_context
    5. import torch
    6. import numpy as np
    7. import math
    8. import torch.nn.functional as F
    9. import cv2
    10. from torch.autograd import Variable
    11. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    def load_engine(engine_path):
        """Read the serialized engine at ``engine_path`` and deserialize it with TensorRT."""
        # Only errors are logged; switch to trt.Logger.WARNING or INFO for more output.
        trt_logger = trt.Logger(trt.Logger.ERROR)
        with open(engine_path, 'rb') as engine_file, trt.Runtime(trt_logger) as runtime:
            serialized = engine_file.read()
            return runtime.deserialize_cuda_engine(serialized)
    def pad(image, size_divisor=32, pad_value=0):
        """Pad the last two dims of ``image`` up to a multiple of ``size_divisor``.

        Returns:
            (padded_image, [left, right, top, bottom, new_height, new_width]);
            any odd remainder goes to the right/bottom side.
        """
        old_h, old_w = image.shape[-2:]
        target_h = int(math.ceil(old_h / size_divisor) * size_divisor)
        target_w = int(math.ceil(old_w / size_divisor) * size_divisor)
        gap_w, gap_h = target_w - old_w, target_h - old_h
        left, top = gap_w // 2, gap_h // 2
        # Border order required by F.pad: last dim first (left, right, top, bottom).
        borders = [left, gap_w - left, top, gap_h - top]
        return F.pad(image, borders, value=pad_value), borders + [target_h, target_w]
    # Pre-process the test image: BGR -> RGB, resize to 544x544, CHW float32
    # divided by 255, add a batch dimension, then pad.
    img_src = cv2.imread('/home/ubuntu/OrienMask/assets/000000163126.jpg')
    if img_src is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('failed to read image: /home/ubuntu/OrienMask/assets/000000163126.jpg')
    img_color = cv2.cvtColor(img_src, cv2.COLOR_BGR2RGB)
    # The third positional parameter of cv2.resize is `dst`, so the
    # interpolation mode has to be passed by keyword.
    img_resize = cv2.resize(img_color, (544, 544), interpolation=cv2.INTER_LINEAR)
    chw = np.transpose(img_resize, (2, 0, 1)).astype(np.float32) / 255
    dummy_input, pad_info = pad(torch.from_numpy(chw).unsqueeze(0))
    # memcpy_htod needs one contiguous host buffer.
    image = np.ascontiguousarray(dummy_input.numpy())
    path = "/home/ubuntu/CLionProjects/D435_OrienMask/model/orienmask_yolo_sim.engine"

    # 1. Load the engine and create the execution context.
    engine = load_engine(path)
    context = engine.create_execution_context()
    context.active_optimization_profile = 0

    # 2. Allocate device memory and copy the input from host to device.
    # The input shape is set on binding 0 before the output shapes are queried.
    context.set_binding_shape(0, image.shape)
    d_input = cuda.mem_alloc(image.nbytes)
    cuda.memcpy_htod(d_input, image)

    # Binding 0 is the input; bindings 1..9 are the nine output heads.
    num_outputs = 9
    outputs = [np.empty(tuple(context.get_binding_shape(index)), dtype=np.float32)
               for index in range(1, num_outputs + 1)]
    d_outputs = [cuda.mem_alloc(host.nbytes) for host in outputs]
    bindings = [int(d_input)] + [int(dev) for dev in d_outputs]

    # 3. Run inference synchronously and copy every output back to the host.
    context.execute_v2(bindings)
    for host, dev in zip(outputs, d_outputs):
        cuda.memcpy_dtoh(host, dev)
        print(host.shape)

    数据比对:onnx和TensorRT engine的输出是一致的。onnx的数据:

    engine的数据

     测试一下时间:只比较推理时间,后处理保持一致。下面是engine完整的推理代码(含转模型):

    1. import datetime
    2. import tensorrt as trt
    3. import matplotlib.pyplot as plt
    4. import pycuda.driver as cuda
    5. import pycuda.autoinit
    6. from pycuda.tools import make_default_context
    7. import torch
    8. import numpy as np
    9. import math
    10. import torch.nn.functional as F
    11. import cv2
    12. from torch.autograd import Variable
    13. from argparse import ArgumentParser
    14. from eval.orienmask_yolo_postprocess import OrienMaskYOLOPostProcess
    15. from utils.visualizer import InferenceVisualizer
    16. device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    def build_engine(onnx_file_path, engine_file_path, half=False):
        """Takes an ONNX file and creates a TensorRT engine to run inference with.

        Args:
            onnx_file_path: path of the ONNX model to parse.
            engine_file_path: where the serialized engine is written.
            half: request FP16; only honoured when the builder reports
                ``platform_has_fast_fp16``.

        Returns:
            ``engine_file_path``.

        Raises:
            RuntimeError: if the ONNX file cannot be parsed or the engine
                cannot be built.
        """
        logger = trt.Logger(trt.Logger.INFO)
        builder = trt.Builder(logger)
        config = builder.create_builder_config()
        # 4 MiB. Same value as the former `4 * 1 << 20`, which parses as (4 * 1) << 20.
        config.max_workspace_size = 4 << 20
        flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        network = builder.create_network(flag)
        parser = trt.OnnxParser(network, logger)
        if not parser.parse_from_file(str(onnx_file_path)):
            # Surface the parser's own diagnostics instead of discarding them.
            details = '; '.join(str(parser.get_error(i)) for i in range(parser.num_errors))
            message = f'failed to load ONNX file: {onnx_file_path}'
            if details:
                message += f' ({details})'
            raise RuntimeError(message)
        # bool() keeps this working for non-bool truthy values, unlike `half &= ...`.
        half = bool(half) and builder.platform_has_fast_fp16
        if half:
            config.set_flag(trt.BuilderFlag.FP16)
        engine = builder.build_engine(network, config)
        if engine is None:
            # A failed build returns None, which would otherwise blow up inside `with`.
            raise RuntimeError(f'failed to build TensorRT engine from: {onnx_file_path}')
        with engine, open(engine_file_path, 'wb') as t:
            t.write(engine.serialize())
        return engine_file_path
    def load_engine(engine_path):
        """Deserialize the TensorRT engine stored at ``engine_path`` and return it."""
        # Only errors are logged; switch to trt.Logger.WARNING or INFO for more output.
        quiet_logger = trt.Logger(trt.Logger.ERROR)
        with open(engine_path, 'rb') as stream, trt.Runtime(quiet_logger) as runtime:
            blob = stream.read()
            return runtime.deserialize_cuda_engine(blob)
    def pad(image, size_divisor=32, pad_value=0):
        """Pad the last two dims of ``image`` up to a multiple of ``size_divisor``.

        Returns:
            (padded_image, [left, right, top, bottom, new_height, new_width]);
            any odd remainder goes to the right/bottom side.
        """
        rows, cols = image.shape[-2:]
        padded_rows = int(math.ceil(rows / size_divisor) * size_divisor)
        padded_cols = int(math.ceil(cols / size_divisor) * size_divisor)
        spare_cols = padded_cols - cols
        spare_rows = padded_rows - rows
        left = spare_cols // 2
        top = spare_rows // 2
        right = spare_cols - left
        bottom = spare_rows - top
        # Border order required by F.pad: last dim first (left, right, top, bottom).
        borders = [left, right, top, bottom]
        result = F.pad(image, borders, value=pad_value)
        return result, borders + [padded_rows, padded_cols]
    def onnx2engine(args):
        """Build a TensorRT engine from ONNX, run one image through it and save the result.

        Each stage is timed and printed in milliseconds:
        1. build the engine from ``args.onnx_model_path`` into ``args.engine_file_path``;
        2. read ``args.img``, convert BGR->RGB, resize to 544x544, divide by 255, pad;
        3. load the engine from disk;
        4. create the context, allocate buffers, run inference and post-process;
        5. draw the first prediction and save it to ``args.engineoutput``.

        Args:
            args: parsed CLI namespace providing ``onnx_model_path``,
                ``engine_file_path``, ``fp16``, ``img`` and ``engineoutput``.

        Raises:
            FileNotFoundError: if OpenCV cannot read ``args.img``.
        """
        def elapsed_ms(started):
            # timedelta.microseconds is only the sub-second remainder and drops
            # whole seconds; total_seconds() covers the full interval.
            return round((datetime.datetime.now() - started).total_seconds() * 1000, 3)

        def to_tensor(array):
            return torch.tensor(array, device=device)

        start_convert = datetime.datetime.now()
        build_engine(args.onnx_model_path, args.engine_file_path, args.fp16)
        print("convert model:", elapsed_ms(start_convert), "ms")

        start_load_data = datetime.datetime.now()
        img_src = cv2.imread(args.img)
        if img_src is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'failed to read image: {args.img}')
        img_color = cv2.cvtColor(img_src, cv2.COLOR_BGR2RGB)
        # The third positional parameter of cv2.resize is `dst`, so the
        # interpolation mode has to be passed by keyword.
        img_resize = cv2.resize(img_color, (544, 544), interpolation=cv2.INTER_LINEAR)
        # HWC uint8 -> CHW float32, all channels divided by 255.
        chw = np.transpose(img_resize, (2, 0, 1)).astype(np.float32) / 255
        dummy_input, pad_info = pad(torch.from_numpy(chw).unsqueeze(0))
        # memcpy_htod needs one contiguous host buffer.
        image = np.ascontiguousarray(dummy_input.numpy())
        print("load data:", elapsed_ms(start_load_data), "ms")

        start_load = datetime.datetime.now()
        # 1. Load the engine.
        engine = load_engine(args.engine_file_path)
        print("load model:", elapsed_ms(start_load), "ms")

        start_forward = datetime.datetime.now()
        context = engine.create_execution_context()
        context.active_optimization_profile = 0
        # 2. Allocate device memory and copy the input from host to device.
        # The input shape is set on binding 0 before the output shapes are queried.
        context.set_binding_shape(0, image.shape)
        d_input = cuda.mem_alloc(image.nbytes)
        cuda.memcpy_htod(d_input, image)

        # Binding 0 is the input; bindings 1..9 are the nine output heads.
        num_outputs = 9
        outputs = [np.empty(tuple(context.get_binding_shape(index)), dtype=np.float32)
                   for index in range(1, num_outputs + 1)]
        d_outputs = [cuda.mem_alloc(host.nbytes) for host in outputs]
        bindings = [int(d_input)] + [int(dev) for dev in d_outputs]

        # 3. Run inference synchronously and copy every output back to the host.
        context.execute_v2(bindings)
        for host, dev in zip(outputs, d_outputs):
            cuda.memcpy_dtoh(host, dev)
            print(host.shape)

        # outputs[k] is binding k + 1. Fresh tensors are built for every
        # consumer, exactly as before, so nothing is aliased.
        result_tuple = ((to_tensor(outputs[0]), to_tensor(outputs[3])),
                        (to_tensor(outputs[1]), to_tensor(outputs[4])),
                        (to_tensor(outputs[2]), to_tensor(outputs[5])))
        pred_bbox_batch = [to_tensor(outputs[0]), to_tensor(outputs[1]), to_tensor(outputs[2])]
        pred_orien_batch = [to_tensor(outputs[6]), to_tensor(outputs[7]), to_tensor(outputs[8])]

        # Post-processing settings handed to OrienMaskYOLOPostProcess.
        self_grid_size = [[17, 17], [34, 34], [68, 68]]
        self_image_size = [544, 544]
        self_anchors = [[12, 16], [19, 36], [40, 28], [36, 75], [76, 55], [72, 146],
                        [142, 110], [192, 243], [459, 401]]
        self_anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        self_num_classes = 80
        self_conf_thresh = 0.05
        self_nms_func = None
        self_nms_pre = 400
        self_nms_post = 100
        self_orien_thresh = 0.3
        item_orien = OrienMaskYOLOPostProcess(self_grid_size, self_image_size, self_anchors,
                                              self_anchor_mask, self_num_classes,
                                              self_conf_thresh, self_nms_func, self_nms_pre,
                                              self_nms_post, self_orien_thresh, device)
        predictions = item_orien.apply(result_tuple, pred_bbox_batch, pred_orien_batch)
        print("Forward & Postprocess:", elapsed_ms(start_forward), "ms")

        start_visual = datetime.datetime.now()
        dataset = 'COCO'
        with_mask = True
        conf_thresh = 0.3
        alpha = 0.6
        line_thickness = 1
        visualizer = InferenceVisualizer(dataset, device, with_mask, conf_thresh, alpha,
                                         line_thickness)
        show_image = visualizer(predictions[0],
                                torch.tensor(img_color, device=device, dtype=torch.float32),
                                pad_info)
        plt.imsave(args.engineoutput, show_image)
        print("Visualize::", elapsed_ms(start_visual), "ms")
    def _parse_bool_flag(value):
        """Convert a command-line token such as 'true' or '0' into a bool."""
        if isinstance(value, bool):
            return value
        text = str(value).strip().lower()
        if text in ('1', 'true', 'yes', 'y', 'on'):
            return True
        if text in ('0', 'false', 'no', 'n', 'off'):
            return False
        # argparse turns a ValueError from a `type` callable into a usage error.
        raise ValueError(f'not a boolean value: {value!r}')


    def main():
        """Test a single image.

        Parses the command-line options and passes them to ``onnx2engine``.
        ``--fp16`` accepts a bare flag (``--fp16``) or an explicit value
        (``--fp16 true`` / ``--fp16 false``).
        """
        parser = ArgumentParser()
        parser.add_argument('--img', default="/home/ubuntu/OrienMask/assets/000000163126.jpg",
                            help='Image file')
        parser.add_argument('--onnx_model_path',
                            default="/home/ubuntu/OrienMask/checkpoints/orienmask_yolo.onnx",
                            help='onnx_model_path')
        parser.add_argument('--engine_file_path',
                            default="/home/ubuntu/OrienMask/checkpoints/orienmask_yolo.engine",
                            help='TensorRT engine file')
        # Without a type, `--fp16 False` arrived as the truthy string "False".
        parser.add_argument('--fp16', type=_parse_bool_flag, nargs='?', const=True, default=False,
                            help='Build the engine in FP16 mode')
        parser.add_argument('--device', default='cuda:0', help='Device used for inference')
        parser.add_argument('--engineoutput', default=r'enginesxj731533730.jpg', help='Output image')
        parser.add_argument('--num_anchors', type=int, default=3, help='num_anchors')
        parser.add_argument('--num_classes', type=int, default=80, help='num_classes')
        args = parser.parse_args()
        onnx2engine(args)


    if __name__ == '__main__':
        main()

    Python下原始infer.py(pt模型)的时间

    1. [406, 194, 623, 435] 0.9981988668441772 cup
    2. [114, 96, 401, 458] 0.9951307773590088 cup
    3. [399, 292, 450, 343] 0.8919668793678284 baseball-glove
    4. [379, 61, 531, 178] 0.7934483289718628 baseball-bat
    5. The inference takes 1.511667236328125 seconds.
    6. The average inference time is 1511.67 ms (0.66 fps)
    7. Load data: 117.35ms (8.52fps)
    8. Forward & Postprocess: 1329.60ms (0.75fps)
    9. Visualize: 11.64ms (85.92fps)
    10. 100%|██████████| 1/1 [00:01<00:00, 1.46s/it]

    onnx的时间 

    1. /usr/bin/python3.8 /home/ubuntu/OrienMask/pytorch2onnx.py
    2. load data: 27.229 ms
    3. convert model: 472.885 ms
    4. load model: 0.066 ms
    5. Forward & Postprocess: 364.452 ms
    6. [406, 194, 623, 435] 0.9981685876846313 cup
    7. [114, 96, 401, 458] 0.9952691793441772 cup
    8. [399, 292, 450, 343] 0.8922784328460693 baseball-glove
    9. [379, 61, 531, 178] 0.7953709363937378 baseball-bat
    10. Visualize:: 14.279 ms

    tensorRT的时间fp32

    1. /usr/bin/python3.8 /home/ubuntu/OrienMask/onn2trt.py
    2. [TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.3.0 but loaded cuBLAS/cuBLAS LT 11.2.1
    3. [TensorRT] INFO: Some tactics do not have sufficient workspace memory to run. Increasing workspace size may increase performance, please check verbose output.
    4. [TensorRT] INFO: Detected 1 inputs and 9 output network tensors.
    5. [TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.3.0 but loaded cuBLAS/cuBLAS LT 11.2.1
    6. convert model: 112.855 ms
    7. load data: 11.616 ms
    8. load model: 871.787 ms
    9. (1, 255, 17, 17)
    10. (1, 255, 34, 34)
    11. (1, 255, 68, 68)
    12. (1, 6, 136, 136)
    13. (1, 6, 136, 136)
    14. (1, 6, 136, 136)
    15. (1, 6, 544, 544)
    16. (1, 6, 544, 544)
    17. (1, 6, 544, 544)
    18. excute onnx infer
    19. Forward & Postprocess: 932.863 ms
    20. [406, 194, 623, 435] 0.9981685876846313 cup
    21. [114, 96, 401, 458] 0.9952691793441772 cup
    22. [399, 292, 450, 343] 0.892278790473938 baseball-glove
    23. [379, 61, 531, 178] 0.7953709363937378 baseball-bat
    24. Visualize:: 280.247 ms
    25. Process finished with exit code 0

    tensorRT的时间fp16

    1. /usr/bin/python3.8 /home/ubuntu/OrienMask/onn2trt.py
    2. [TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.3.0 but loaded cuBLAS/cuBLAS LT 11.2.1
    3. [TensorRT] INFO: Some tactics do not have sufficient workspace memory to run. Increasing workspace size may increase performance, please check verbose output.
    4. [TensorRT] INFO: Detected 1 inputs and 9 output network tensors.
    5. [TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.3.0 but loaded cuBLAS/cuBLAS LT 11.2.1
    6. convert model: 21.035 ms
    7. load data: 10.628 ms
    8. load model: 887.876 ms
    9. (1, 255, 17, 17)
    10. (1, 255, 34, 34)
    11. (1, 255, 68, 68)
    12. (1, 6, 136, 136)
    13. (1, 6, 136, 136)
    14. (1, 6, 136, 136)
    15. (1, 6, 544, 544)
    16. (1, 6, 544, 544)
    17. (1, 6, 544, 544)
    18. excute onnx infer
    19. Forward & Postprocess: 919.326 ms
    20. [406, 194, 623, 435] 0.9981685876846313 cup
    21. [114, 96, 401, 458] 0.9952691793441772 cup
    22. [399, 292, 450, 343] 0.892278790473938 baseball-glove
    23. [379, 61, 531, 178] 0.7953709363937378 baseball-bat
    24. Visualize:: 53.748 ms
    25. Process finished with exit code 0

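    感觉我显卡太拉胯了,还不如onnx快....(注意:上面TensorRT的"Forward & Postprocess"计时还包含了创建execution context、分配显存以及主机/设备间拷贝的开销;而且各段耗时是用timedelta.microseconds打印的,只包含不足一秒的部分,所以这些数字并不能准确反映纯推理时间。)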

    c++代码待学习补充

    参考:

     Documentation Archives :: NVIDIA Deep Learning TensorRT Documentation

  • 相关阅读:
    百度文心一言GPT免费入口也来了!!!
    郑卢高速洛阳至洛宁段路基路面综合设计K14+000-K15+400设计计算书+cad图纸
    Mongodb索引的创建与命名
    真的,Java并发编程基础入门看这个就够了
    1.6 列表(Python)
    对比SQL学习power bi--(1)分组求和后,再平均!
    毕业设计之基于node.js+Vue+Element驾校信息管理系统
    ESB(企业服务总线)
    二十三、生成帮助文档
    电力电子转战数字IC20220729day59——uvm入门实验1
  • 原文地址:https://blog.csdn.net/sxj731533730/article/details/126118443