当前位置:   article > 正文

Tensorrt踩坑记 | python、pytorch 转 onnx 推理加速

tensorrt python多进程推理踩坑

来源:CSDN—makcooo

地址:https://blog.csdn.net/qq_44756223/article/details/107727863

01

配置

Ubuntu 16.04
python 3.6
onnx 1.6
pytorch 1.5
pycuda 2019.1.2
torchvision 0.1.8

建议详读,先安装好环境:

https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#import_onnx_python

02

步骤

1. 将pytorch模型转换成onnx模型

这边用的是Darknet生成的pytorch模型

  import onnx
  import torch

  # Names given to the graph's input and output tensors in the exported file.
  input_name = ['input']
  output_name = ['output']

  # x.model is the author's PyTorch model (built from Darknet, defined elsewhere).
  # The model and the example input must be on the same device; without .cuda()
  # on the model, export fails with "Input type (torch.cuda.FloatTensor) and
  # weight type (torch.FloatTensor) should be the same".
  model = x.model.cuda()
  # Alternative: model = torch.load('', map_location="cuda:0")

  # Export in inference mode so dropout / batch-norm layers behave as they
  # will at deployment time.
  model.eval()

  # A plain tensor is enough as the tracing input (the Variable wrapper is
  # deprecated); avoid the name `input`, which shadows the builtin.
  dummy_input = torch.randn(1, 3, 544, 544).cuda()
  torch.onnx.export(model, dummy_input, 'model.onnx', input_names=input_name,
                    output_names=output_name, verbose=True)

其中

  1. #model = x.model.cuda()
  2. #若是不添加cuda()
  3. model = x.model

出现报错

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

2. 检查模型

 
 
  # Reload the exported graph from disk and run the ONNX checker on it;
  # the success message is only printed if the checker does not raise.
  model = onnx.load("model.onnx")
  onnx.checker.check_model(model)
  print("==> Passed")

3. 测试onnx模型使用tensorrt推理前后对比

 
 
  1. import pycuda.autoinit
  2. import numpy as np
  3. import pycuda.driver as cuda
  4. import tensorrt as trt
  5. import torch
  6. import os
  7. import time
  8. from PIL import Image
  9. import cv2
  10. import torchvision
  # Image fed to both the TensorRT engine and the PyTorch reference model.
  filename = "000000.jpg"
  # Batch size the engine is built for.
  max_batch_size = 1
  # ONNX model that is parsed when no serialized engine exists yet.
  onnx_model_path = "yolo.onnx"
  # Logger handed to the TensorRT builder, ONNX parser and runtime.
  TRT_LOGGER = trt.Logger()
  def get_img_np_nchw(filename, size=(1920, 1080)):
      """Load an image and return it as a normalized NCHW batch of one.

      The image is read with OpenCV, converted from BGR to RGB, resized to
      ``size``, scaled to [0, 1] and normalized per channel with the mean/std
      constants below (these look like the usual ImageNet statistics).

      Args:
          filename: Path of the image file to read.
          size: Target ``(width, height)`` handed to ``cv2.resize``. It has to
              match the spatial input size of the model being run, otherwise
              the flattened image will not fit the engine's input buffer.

      Returns:
          A contiguous float64 array of shape ``(1, 3, height, width)``.

      Raises:
          FileNotFoundError: If OpenCV cannot read ``filename``.
      """
      image = cv2.imread(filename)
      if image is None:
          # cv2.imread reports failure by returning None instead of raising,
          # which would otherwise surface as an obscure error in cvtColor.
          raise FileNotFoundError('Cannot read image file {}'.format(filename))

      image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
      image_rgb = cv2.resize(image_rgb, tuple(size))

      mean = np.array([0.485, 0.456, 0.406])
      std = np.array([0.229, 0.224, 0.225])
      # Broadcasting applies mean/std along the last (channel) axis of HWC.
      normalized = (np.array(image_rgb, dtype=float) / 255. - mean) / std

      # HWC -> CHW, made contiguous, then a leading batch axis is added.
      chw = np.ascontiguousarray(normalized.transpose(2, 0, 1))
      return np.expand_dims(chw, axis=0)
  class HostDeviceMem(object):
      """Pair of buffers: ``host`` is the CPU memory, ``device`` the GPU memory."""

      def __init__(self, host_mem, device_mem):
          self.host, self.device = host_mem, device_mem

      def __repr__(self):
          # repr and str intentionally render the same text.
          return str(self)

      def __str__(self):
          return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))
  def allocate_buffers(engine):
      """Allocate a host/device buffer pair for every binding of ``engine``.

      Returns:
          A tuple ``(inputs, outputs, bindings, stream)``: ``inputs`` and
          ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` holds the
          device buffers converted with ``int()`` in binding order, and
          ``stream`` is a new ``cuda.Stream``.
      """
      inputs, outputs, bindings = [], [], []
      stream = cuda.Stream()
      for name in engine:
          element_count = trt.volume(engine.get_binding_shape(name)) * engine.max_batch_size
          np_dtype = trt.nptype(engine.get_binding_dtype(name))
          # Page-locked host buffer plus a device buffer of the same byte size.
          host_buf = cuda.pagelocked_empty(element_count, np_dtype)
          device_buf = cuda.mem_alloc(host_buf.nbytes)
          bindings.append(int(device_buf))
          # File the pair under inputs or outputs depending on the binding kind.
          destination = inputs if engine.binding_is_input(name) else outputs
          destination.append(HostDeviceMem(host_buf, device_buf))
      return inputs, outputs, bindings, stream
  def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
                 fp16_mode=False, int8_mode=False, save_engine=False,
                 ):
      """Load a serialized TensorRT engine if present, otherwise build one from ONNX.

      Args:
          max_batch_size: Value assigned to ``builder.max_batch_size``.
          onnx_file_path: ONNX model to parse when an engine has to be built.
          engine_file_path: Serialized engine to load if the file exists; also
              the destination when ``save_engine`` is true.
          fp16_mode: Value assigned to ``builder.fp16_mode``.
          int8_mode: Not implemented; a true value raises NotImplementedError.
          save_engine: Write the freshly built engine to ``engine_file_path``.

      Returns:
          The deserialized or newly built engine.

      Raises:
          NotImplementedError: If ``int8_mode`` is requested for a build.
          FileNotFoundError: If the ONNX file does not exist.
          RuntimeError: If the ONNX file cannot be parsed or the build fails.
      """
      def build_engine(max_batch_size, save_engine):
          """Parse the ONNX file and build a TensorRT engine from it."""
          if int8_mode:
              # INT8 building is not implemented yet.
              raise NotImplementedError
          if not os.path.exists(onnx_file_path):
              raise FileNotFoundError('ONNX file {} not found'.format(onnx_file_path))

          # The ONNX parser only accepts networks created with an explicit batch dimension.
          explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
          # NOTE: builder.max_workspace_size / fp16_mode / build_cuda_engine are
          # the TensorRT 7 builder API this tutorial targets.
          with trt.Builder(TRT_LOGGER) as builder, \
                  builder.create_network(explicit_batch) as network, \
                  trt.OnnxParser(network, TRT_LOGGER) as parser:
              builder.max_workspace_size = 1 << 30  # 1 GiB of builder workspace
              builder.max_batch_size = max_batch_size
              builder.fp16_mode = fp16_mode
              builder.int8_mode = int8_mode

              print('Loading ONNX file from path {}...'.format(onnx_file_path))
              with open(onnx_file_path, 'rb') as model:
                  print('Beginning ONNX file parsing')
                  # Parse exactly once: a second model.read() would return b''
                  # because the file position is already at EOF.
                  parsed = parser.parse(model.read())
              if not parsed:
                  for index in range(parser.num_errors):
                      print(parser.get_error(index))
                  print("===========Parsing fail!!!!=================")
                  # Building from a half-populated network is pointless; stop here.
                  raise RuntimeError('Failed to parse ONNX file {}'.format(onnx_file_path))
              print('Completed parsing of ONNX file')

              print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
              engine = builder.build_cuda_engine(network)
              if engine is None:
                  # The builder returns None on failure; fail before serialize().
                  raise RuntimeError('Failed to build an engine from {}'.format(onnx_file_path))
              print("Completed creating Engine")
              if save_engine:
                  with open(engine_file_path, "wb") as f:
                      f.write(engine.serialize())
              return engine

      if os.path.exists(engine_file_path):
          # A serialized engine exists: load it instead of building a new one.
          print("Reading engine from file {}".format(engine_file_path))
          with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
              return runtime.deserialize_cuda_engine(f.read())
      return build_engine(max_batch_size, save_engine)
  def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
      """Run one inference pass and return the host-side output buffers.

      Args:
          context: Execution context whose ``execute_async`` is called.
          bindings: Device buffer addresses passed to ``execute_async``.
          inputs: Objects with ``host``/``device`` attributes to upload.
          outputs: Objects with ``host``/``device`` attributes to download.
          stream: CUDA stream on which copies and execution are enqueued.
          batch_size: Batch size forwarded to ``execute_async``.

      Returns:
          A list with the ``host`` buffer of every entry in ``outputs``.
      """
      # Plain loops: these calls are made for their side effects only, so
      # building throw-away lists with comprehensions is misleading.
      for inp in inputs:
          cuda.memcpy_htod_async(inp.device, inp.host, stream)
      context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
      for out in outputs:
          cuda.memcpy_dtoh_async(out.host, out.device, stream)
      # Everything above was only enqueued; wait before reading host buffers.
      stream.synchronize()
      return [out.host for out in outputs]
  def postprocess_the_outputs(h_outputs, shape_of_output):
      """Return ``h_outputs`` reshaped to ``shape_of_output``."""
      return h_outputs.reshape(*shape_of_output)
  img_np_nchw = get_img_np_nchw(filename)
  img_np_nchw = img_np_nchw.astype(dtype=np.float32)

  # Whether these two modes are usable depends on the hardware.
  fp16_mode = False
  int8_mode = False
  trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)

  # Load the serialized engine if it exists, otherwise build it from the ONNX file.
  engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)
  # Create the execution context for this engine.
  context = engine.create_execution_context()
  # Allocate host/device buffers for every input and output binding.
  inputs, outputs, bindings, stream = allocate_buffers(engine)

  # Expected shape of the first output; change it to your model's output shape.
  shape_of_output = (max_batch_size, 1000)
  # Hand the flattened image to the first input binding.
  inputs[0].host = img_np_nchw.reshape(-1)
  # inputs[1].host = ... for models with several inputs

  # do_inference synchronizes its stream before returning, so this interval
  # covers the upload, the execution and the download.
  t1 = time.time()
  trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
  t2 = time.time()
  feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
  print('TensorRT ok')

  # Replace `model` with your own model; this is torchvision's ResNet-50,
  # whose pretrained weights are downloaded on first use.
  model = torchvision.models.resnet50(pretrained=True).cuda()
  resnet_model = model.eval()
  input_for_torch = torch.from_numpy(img_np_nchw).cuda()

  # CUDA kernels are launched asynchronously: synchronize before starting and
  # before stopping the clock, otherwise only the launch overhead is measured.
  torch.cuda.synchronize()
  t3 = time.time()
  with torch.no_grad():  # inference only, no autograd bookkeeping
      feat_2 = resnet_model(input_for_torch)
  torch.cuda.synchronize()
  t4 = time.time()
  feat_2 = feat_2.cpu().numpy()
  print('Pytorch ok!')

  mse = np.mean((feat - feat_2)**2)
  print("Inference time with the TensorRT engine: {}".format(t2-t1))
  print("Inference time with the PyTorch model: {}".format(t4-t3))
  print('MSE Error = {}'.format(mse))
  print('All completed!')

报错:

In node -1 (importModel): INVALID_VALUE: Assertion failed: !_importer_ctx.network()->hasImplicitBatchDimension() && "This version of the ONNX parser only supports TensorRT INetworkDefinitions with an explicit batch dimension. Please ensure the network was created using the EXPLICIT_BATCH NetworkDefinitionCreationFlag."

解决:

  1. def build_engine(max_batch_size, save_engine):
  2. EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
  3. with trt.Builder(TRT_LOGGER) as builder, \
  4. builder.create_network(EXPLICIT_BATCH) as network, \
  5.                 trt.OnnxParser(network, TRT_LOGGER) as parser:

报错:

  1. Traceback (most recent call last):
  2. line 126, in <listcomp>
  3. [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
  4. pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument

解决:

 
 
  1. def get_img_np_nchw(filename):
  2. image = cv2.imread(filename)
  3. image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  4.     image_cv = cv2.resize(image_cv, (1920, 1080))

输入的检测图像尺寸需要resize成model的input的size

改为

  1. def get_img_np_nchw(filename):
  2. image = cv2.imread(filename)
  3. image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  4.     image_cv = cv2.resize(image_cv, (544,544))

报错

  1. line 139, in postprocess_the_outputs
  2. h_outputs = h_outputs.reshape(*shape_of_output)
  3. ValueError: cannot reshape array of size 5780 into shape (1,1000)

解决:

  1. #shape_of_output = (max_batch_size, 1000)
  2. #修改成自己模型output的大小
  3. shape_of_output = (1,20,17,17)
  1. 猜您喜欢:
  2.  戳我,查看GAN的系列专辑~!
  3. 一顿午饭外卖,成为CV视觉前沿弄潮儿!
  4. CVPR 2022 | 25+方向、最新50篇GAN论文
  5.  ICCV 2021 | 35个主题GAN论文汇总
  6. 110篇!CVPR 2021最全GAN论文梳理
  7. 100篇!CVPR 2020最全GAN论文梳理
  8. 拆解组新的GAN:解耦表征MixNMatch
  9. StarGAN第2版:多域多样性图像生成
  10. 附下载 | 《可解释的机器学习》中文版
  11. 附下载 |《TensorFlow 2.0 深度学习算法实战》
  12. 附下载 |《计算机视觉中的数学方法》分享
  13. 《基于深度学习的表面缺陷检测方法综述》
  14. 《零样本图像分类综述: 十年进展》
  15. 《基于深度神经网络的少样本学习综述》
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/57692
推荐阅读
相关标签
  

闽ICP备14008679号