• cuMemcpyHtoDAsync failed: invalid argument


    TensorRT 运行报错:

    pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
    -------------------------------------------------------------------
    PyCUDA ERROR: The context stack was not empty upon module cleanup.
    -------------------------------------------------------------------
    A context was still active when the context stack was being
    cleaned up. At this point in our execution, CUDA may already
    have been deinitialized, so there is no way we can finish
    cleanly. The program will be aborted now.
    Use Context.pop() to avoid this problem.
    -------------------------------------------------------------------

    tensorrt 推理代码:

    1. import sys
    2. sys.path.append('../../tools/')
    3. import cv2
    4. import time
    5. import numpy as np
    6. import tensorrt as trt
    7. import pycuda.driver as cuda
    8. import pycuda.autoinit
    9. print('trt version',trt.__version__)
    10. TRT_LOGGER = trt.Logger()
    class HostDeviceMem(object):
        """Pair a host-side buffer with its device-side counterpart.

        The two objects are stored unchanged on ``host`` and ``device`` so the
        copy helpers can move data between them.
        """

        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem

        def __str__(self):
            return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

        def __repr__(self):
            # The readable form doubles as the debug representation.
            return self.__str__()
    # Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
    def allocate_buffers(engine, context):
        """Allocate one host/device buffer pair per engine binding.

        Binding shapes are read from ``context`` (not from the engine) so that
        shapes set on the execution context are the ones used for sizing.

        Args:
            engine: engine whose bindings are iterated.
            context: execution context queried for each binding's shape.

        Returns:
            ``(inputs, outputs, bindings, stream)`` where ``inputs``/``outputs``
            are lists of ``HostDeviceMem``, ``bindings`` holds the integer
            device addresses in binding order, and ``stream`` is a new
            ``cuda.Stream``.

        Raises:
            ValueError: if a binding shape still contains a negative dimension,
                i.e. the buffer size cannot be computed yet.
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for i, binding in enumerate(engine):
            shape = context.get_binding_shape(i)
            if any(dim < 0 for dim in shape):
                # A negative dimension would make the volume meaningless;
                # presumably the input shape was not set on the context yet.
                raise ValueError(
                    "binding {!r} has unresolved shape {}; set the input "
                    "binding shape on the context first".format(binding, tuple(shape))
                )
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers of matching byte size.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Device addresses are collected in binding order.
            bindings.append(int(device_mem))
            # File the pair under inputs or outputs.
            target = inputs if engine.binding_is_input(binding) else outputs
            target.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
    # This function is generalized for multiple inputs/outputs.
    # inputs and outputs are expected to be lists of HostDeviceMem objects.
    def do_inference(context, bindings, inputs, outputs, stream, batch_size):
        """Copy inputs to the device, run the engine, and copy outputs back.

        All transfers and the execution are queued on ``stream``; the function
        returns only after the stream has been synchronized.

        Args:
            context: execution context used to launch inference.
            bindings: device addresses passed through to ``execute_async``.
            inputs: ``HostDeviceMem`` pairs copied host -> device.
            outputs: ``HostDeviceMem`` pairs copied device -> host.
            stream: stream on which every operation is enqueued.
            batch_size: forwarded unchanged to ``execute_async``.

        Returns:
            A list with the ``host`` buffer of every entry in ``outputs``.
        """
        # Transfer input data to the GPU. The host array must match the device
        # allocation in byte size, otherwise the driver rejects the copy.
        for inp in inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, stream)
        # Run inference.
        context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        for out in outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, stream)
        # Wait for all queued work before handing the host buffers back.
        stream.synchronize()
        # Return only the host outputs.
        return [out.host for out in outputs]
    # Softmax re-implemented with numpy.
    def softmax(out_np, dim):
        """Return the softmax of ``out_np`` along axis ``dim``.

        The per-axis maximum is subtracted before exponentiating. This leaves
        the result mathematically unchanged but keeps ``np.exp`` from
        overflowing on large inputs (which would otherwise yield ``nan``).

        Args:
            out_np: array-like of scores.
            dim: axis along which the values are normalized to sum to 1.

        Returns:
            ``numpy.ndarray`` with the same shape as ``out_np``.
        """
        scores = np.asarray(out_np)
        shifted = scores - np.max(scores, axis=dim, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=dim, keepdims=True)
    class FaceClassify(object):
        """Image classifier that runs a serialized TensorRT engine.

        ``configs`` must provide ``face_classify_engine`` (engine file path),
        ``classify_input_size`` (``[height, width]``), ``classify_mean`` and
        ``classify_std`` (per-channel normalization values).
        """

        def __init__(self, configs):
            self.engine_path = configs.face_classify_engine
            self.input_size = configs.classify_input_size
            self.image_size = self.input_size
            self.MEAN = configs.classify_mean
            self.STD = configs.classify_std
            self.engine = self.get_engine()
            self.context = self.engine.create_execution_context()

        def get_engine(self):
            """Deserialize and return the engine stored at ``self.engine_path``."""
            # Read through a context manager so the file handle is always closed.
            with open(self.engine_path, 'rb') as f:
                serialized = f.read()
            runtime = trt.Runtime(TRT_LOGGER)
            return runtime.deserialize_cuda_engine(serialized)

        def _preprocess(self, image_src):
            """Turn a BGR image into a contiguous float32 (1, 3, H, W) array."""
            IN_IMAGE_H, IN_IMAGE_W = self.image_size
            img_in = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
            img_in = cv2.resize(img_in, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)
            img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)  # (3, H, W)
            img_in /= 255.0  # scale to [0, 1]
            # Per-channel mean/std, broadcast over H and W. The arithmetic runs
            # in float64, so the result MUST be cast back to float32: a float64
            # array is twice the size of the device buffer and the host->device
            # copy fails with "cuMemcpyHtoDAsync failed: invalid argument".
            mean = np.asarray(self.MEAN[:3], dtype=np.float64).reshape(-1, 1, 1)
            std = np.asarray(self.STD[:3], dtype=np.float64).reshape(-1, 1, 1)
            img_in = ((img_in - mean) / std).astype(np.float32)
            img_in = np.expand_dims(img_in, axis=0)  # (1, 3, H, W)
            return np.ascontiguousarray(img_in)

        def detect(self, image_src, cuda_ctx = pycuda.autoinit.context):
            """Classify one BGR image.

            Args:
                image_src: BGR image array as returned by ``cv2.imread``.
                cuda_ctx: CUDA context pushed for the duration of the call and
                    popped afterwards; pass ``None`` to skip push/pop.

            Returns:
                ``(label, trt_outputs)``: the arg-max class index and the raw
                list of host output arrays from ``do_inference``.
            """
            if cuda_ctx is not None:
                cuda_ctx.push()
            try:
                start_all=time.time()
                img_in = self._preprocess(image_src)
                start=time.time()
                # Dynamic input: tell the context the actual input shape before
                # sizing the buffers from it.
                self.context.active_optimization_profile = 0
                self.context.set_binding_shape(0, img_in.shape)
                inputs, outputs, bindings, stream = allocate_buffers(self.engine, self.context)
                # Copy INTO the page-locked buffer instead of replacing it, so
                # dtype and byte size always match the device allocation.
                np.copyto(inputs[0].host, img_in.ravel())
                # Do inference
                trt_outputs = do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                           stream=stream, batch_size=1)
                print('infer time',time.time()-start,trt_outputs)
            finally:
                # Always pop, even on failure; a context left on the stack makes
                # PyCUDA abort with "The context stack was not empty upon
                # module cleanup".
                if cuda_ctx is not None:
                    cuda_ctx.pop()
            # trt_outputs stacks to (n_outputs, n_classes): normalize over the
            # class axis. Axis 0 would divide every score by itself (all ones).
            labels_sm = softmax(trt_outputs, dim=1)
            labels_max = np.argmax(labels_sm, axis=1)
            print('time_a',time.time()-start_all)
            # .item() requires exactly one output row.
            return labels_max.item() ,trt_outputs
    if __name__ == '__main__':
        # Minimal attribute container standing in for the project's config object.
        class Params:
            pass

        opt = Params()
        opt.face_classify_engine = 'efficientnet_b1.trt'
        opt.classify_input_size = [128 ,128]
        opt.classify_mean = [0.5 ,0.5 ,0.5]
        opt.classify_std = [0.5 ,0.5 ,0.5]
        face =FaceClassify(opt)
        image_path = r'987.jpg'
        image_src =cv2.imread(image_path)
        # image_src =cv2.imread(r'F:\project\detect\yolov5\tensorrt\yolo-tensorrt_dll_trt8\sln\x64\Release\16_1.jpg')
        if image_src is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError("could not read image: " + image_path)
        # Run the same image several times and print each result.
        for _ in range(10):
            labels_max ,trt_outputs =face.detect(image_src)
            print(trt_outputs)
            print(labels_max)

    原因:输入数据没有转换为 float32 类型(计算均值/方差后变成了 float64)。

    解决方法:

    img_in = ((img_in - mean) / std).astype(np.float32)

    网友的答案也可以参考:

    我个人感觉的原因是输入的数据和模型数据入口所申请的地址不匹配:

    输入图片数据shape不对, 可能不是(N, C, H, W)
    输入图片数据的 dtype 不对。我遇到的就是这种情况:模型是由 PyTorch 转 ONNX 再转 TensorRT 得到的,ONNX 中的输入不支持 float64,只支持单精度(float32)的数据格式;而我在 TensorRT 推理时没有做相应的转换,直接输入了 float64 的图片,所以报错。把输入改成 float32 之后问题就解决了。

    原文链接:https://blog.csdn.net/GungnirsPledge/article/details/108428651

    文章也有解决方法。

  • 相关阅读:
    阶乘分解质因数
    暑假怎么保护孩子的眼睛?预防近视护眼灯推荐
    LED灯实验
    HTML小游戏6 —— 《高达战争》横版射击游戏(附完整源码)
    StackUp Erc4337 账户抽象实现分析
    字节序详细解读
    虚拟内存 & I/O & 零拷贝总结
    LeetCode - Medium - 63. Unique Paths II
    图像分类模型训练总结1
    java-php-net-python-银行招聘信息网计算机毕业设计程序
  • 原文地址:https://blog.csdn.net/jacke121/article/details/125904060