当前位置:   article > 正文

pytorch模型转ONNX转TensorRT,模型转换和推理部署_unsupported_node: assertion failed: (inputs.at(0).

unsupported_node: assertion failed: (inputs.at(0).is_tensor())

一、pth模型转ONNX

  1. import os
  2. import sys
  3. import torch
  4. import numpy as np
  5. from feat.model import ResNet  # 导入自己的模型类
  def load_checkpoint(checkpoint_file, model):
      """Load the checkpoint stored in ``checkpoint_file`` into ``model``.

      Args:
          checkpoint_file: Path of a checkpoint written with ``torch.save``.
          model: Module whose parameters are overwritten with the saved weights.

      Returns:
          The value stored under the checkpoint's ``"epoch"`` key.

      Raises:
          AssertionError: If ``checkpoint_file`` does not exist.
          KeyError: If the checkpoint holds no recognised weights entry.
      """
      err_str = "Checkpoint '{}' not found"
      assert os.path.exists(checkpoint_file), err_str.format(checkpoint_file)
      # Always deserialize on the CPU; the caller moves the model to the GPU.
      checkpoint = torch.load(checkpoint_file, map_location="cpu")
      # The weights must actually be copied into ``model``; otherwise the
      # exported network keeps its random initialisation.
      # NOTE(review): assumes the weights live under "model_state" (or the
      # common "state_dict") -- confirm against the training code.
      for key in ("model_state", "state_dict"):
          if key in checkpoint:
              model.load_state_dict(checkpoint[key])
              break
      else:
          raise KeyError(
              "No model weights found in checkpoint '{}'".format(checkpoint_file)
          )
      return checkpoint["epoch"]
  if __name__ == '__main__':
      os.environ['CUDA_VISIBLE_DEVICES'] = '0'   # GPU index to run on
      model_filename = 'resnet_epoch_17.pyth'

      # Build the network, restore the checkpoint and switch to inference mode.
      model = ResNet()
      load_checkpoint(model_filename, model)
      model = model.cuda()
      model.eval()

      onnx_name = 'resnet.onnx'  # output ONNX file
      # Dummy input; its shape becomes the input shape of the exported graph.
      example = torch.randn((1, 3, 224, 224)).cuda()
      input_names = ["input"]
      output_names = ["outputs"]

      # Convert the model and save it. dynamic_axes is left as None, so every
      # dimension is exported with the fixed size of ``example``. To export a
      # variable batch dimension instead, pass
      # {"input": {0: "batch_size"}, "outputs": {0: "batch_size"}}.
      torch.onnx.export(
          model,
          example,
          onnx_name,
          opset_version=12,
          input_names=input_names,
          output_names=output_names,
          dynamic_axes=None,
      )

二、测试ONNX模型精度

  1. import os
  2. import sys
  3. import torch
  4. import numpy as np
  5. import onnxruntime
  6. import time
  if __name__ == '__main__':
      os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # GPU index to run on
      model_filename = 'resnet_epoch_17.pyth'
      onnx_name = 'resnet.onnx'  # ONNX file written by the export step

      # Rebuild the PyTorch model so both runtimes can be fed the same input.
      # NOTE(review): ResNet and load_checkpoint are not imported by this
      # listing; they presumably come from the export script -- confirm.
      model = ResNet()
      load_checkpoint(model_filename, model)
      model = model.cuda()
      model.eval()

      session = onnxruntime.InferenceSession(onnx_name, providers=['CUDAExecutionProvider'])
      img = np.random.randn(1, 3, 224, 224).astype(np.float32)  # random input

      t1 = time.time()
      onnx_preds = session.run(None, {"input": img})
      t2 = time.time()
      pth_preds = model(torch.from_numpy(img).cuda())
      # CUDA kernels are launched asynchronously; wait for them to finish so
      # the PyTorch timing covers the whole forward pass.
      torch.cuda.synchronize()
      t3 = time.time()

      # Print after timing so console I/O does not distort the measurements.
      print("onnx preds result: ", onnx_preds)
      print("pth preds result: ", pth_preds)
      print("onnx cost time: ", t2 - t1, " pth cost time: ", t3 - t2)

对比打印结果,确认结果保持一致

  1. onnx preds res: [array([[-0.13128008, 0.04037811, 0.0529038 , 0.101323 , -0.03352938,
  2. 0.03099938, 0.06380229, -0.03544223, -0.03368076, 0.06361518,
  3. -0.00668521, -0.01996843, -0.0132075 , -0.03448019, 0.17793381,
  4. 0.08131739, 0.10232763, -0.09122676, 0.01173838, 0.03181053,
  5. -0.05899123, 0.01569226, -0.04734752, -0.12551421, 0.00686131,
  6. -0.00749457, -0.03729884, 0.05349742, 0.0304895 , 0.02956274,
  7. 0.00393172, 0.00196273, 0.01296113, -0.03985897, -0.06289426,
  8. -0.0825834 , -0.28903952, 0.02842386, -0.1718263 , -0.05555207,
  9. -0.03707219, 0.10904352, 0.06582819, 0.04960179, 0.01508415,
  10. 0.05469472, 0.28663486, 0.1183752 , -0.06070469, -0.05200525,
  11. -0.03477468, -0.06193898, -0.04432139, 0.0843045 , -0.12080704,
  12. 0.00163073, -0.08544722, 0.11994477, 0.02619292, 0.05066012,
  13. -0.00332941, -0.1488586 , 0.07936171, 0.06203181, -0.0645356 ,
  14. -0.07661135, -0.05883927, -0.00459472, -0.06721105, -0.02880175,
  15. -0.00337263, -0.00927516, 0.03289868, 0.10054352, -0.09545278,
  16. -0.0216963 , 0.11413048, -0.04580398, 0.02614305, -0.08269466,
  17. 0.01835637, 0.17654261, 0.0573773 , -0.06440263, 0.01176349,
  18. 0.00998674, 0.02840159, 0.14086637, -0.02473863, 0.05228964,
  19. -0.03329878, -0.02751228, -0.04788758, 0.1546051 , 0.05838795,
  20. -0.02351469, -0.01315547, -0.13732813, -0.08146078, 0.01943143,
  21. -0.08991284, 0.14222968, -0.14729632, 0.24547395, -0.05293949,
  22. 0.04446511, 0.05436133, -0.09403729, -0.0900671 , 0.04516568,
  23. 0.10035874, -0.03281724, 0.19480802, -0.11344203, -0.02487336,
  24. -0.08126407, -0.00491623, 0.04313428, -0.10474856, -0.11427435,
  25. -0.01765379, -0.04613522, 0.08338863, 0.00564523, 0.14067101,
  26. 0.05428562, 0.12530491, -0.2503076 ]], dtype=float32)]
  27. pth preds res: tensor([[-0.1313, 0.0404, 0.0529, 0.1013, -0.0335, 0.0310, 0.0638, -0.0354,
  28. -0.0337, 0.0636, -0.0067, -0.0200, -0.0132, -0.0345, 0.1779, 0.0813,
  29. 0.1023, -0.0912, 0.0117, 0.0318, -0.0590, 0.0157, -0.0473, -0.1255,
  30. 0.0069, -0.0075, -0.0373, 0.0535, 0.0305, 0.0296, 0.0039, 0.0020,
  31. 0.0130, -0.0399, -0.0629, -0.0826, -0.2890, 0.0284, -0.1718, -0.0556,
  32. -0.0371, 0.1090, 0.0658, 0.0496, 0.0151, 0.0547, 0.2866, 0.1184,
  33. -0.0607, -0.0520, -0.0348, -0.0619, -0.0443, 0.0843, -0.1208, 0.0016,
  34. -0.0854, 0.1199, 0.0262, 0.0507, -0.0033, -0.1489, 0.0794, 0.0620,
  35. -0.0645, -0.0766, -0.0588, -0.0046, -0.0672, -0.0288, -0.0034, -0.0093,
  36. 0.0329, 0.1005, -0.0955, -0.0217, 0.1141, -0.0458, 0.0261, -0.0827,
  37. 0.0184, 0.1765, 0.0574, -0.0644, 0.0118, 0.0100, 0.0284, 0.1409,
  38. -0.0247, 0.0523, -0.0333, -0.0275, -0.0479, 0.1546, 0.0584, -0.0235,
  39. -0.0132, -0.1373, -0.0815, 0.0194, -0.0899, 0.1422, -0.1473, 0.2455,
  40. -0.0529, 0.0445, 0.0544, -0.0940, -0.0901, 0.0452, 0.1004, -0.0328,
  41. 0.1948, -0.1134, -0.0249, -0.0813, -0.0049, 0.0431, -0.1047, -0.1143,
  42. -0.0177, -0.0461, 0.0834, 0.0056, 0.1407, 0.0543, 0.1253, -0.2503]],
  43. device='cuda:0', grad_fn=<DivBackward0>)
  44. onnx cost time: 0.0062367916107177734 pth cost time: 0.030622243881225586

三、ONNX转TensorRT

  1. import os
  2. import tensorrt as trt
  # Logger shared by the builder, the ONNX parser and the runtime.
  TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
  trt_runtime = trt.Runtime(TRT_LOGGER)
  # Directory that contains this script.
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
  # Network-creation flag requesting an explicit batch dimension.
  EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
  # GPU index to run on.
  os.environ['CUDA_VISIBLE_DEVICES'] = '2'
  def get_engine(input_shape, onnx_file_path="", engine_file_path=""):
      """Return a TensorRT engine, reusing a serialized one when it exists.

      If ``engine_file_path`` exists it is deserialized and returned.
      Otherwise the ONNX model at ``onnx_file_path`` is parsed, an engine is
      built, written to ``engine_file_path`` and returned.

      Args:
          input_shape: Shape assigned to the network's first input.
          onnx_file_path: Path of the ONNX model to convert.
          engine_file_path: Path of the serialized engine to load or create.

      Returns:
          The engine, or ``None`` when ONNX parsing or engine building fails.

      Raises:
          FileNotFoundError: If an engine must be built and the ONNX file is
              missing.
      """

      def build_engine():
          """Parse the ONNX file, build an engine and save it to disk."""
          with trt.Builder(TRT_LOGGER) as builder, \
                  builder.create_network(EXPLICIT_BATCH) as network, \
                  trt.OnnxParser(network, TRT_LOGGER) as parser, \
                  builder.create_builder_config() as config:
              config.max_workspace_size = 1 << 33  # 8 GiB
              # FP32 is used by default; enable the next line to build in FP16.
              # config.set_flag(trt.BuilderFlag.FP16)
              builder.max_batch_size = 1

              # Parse the model file. A missing file is raised as an error
              # rather than ending the process with a success exit code.
              if not os.path.exists(onnx_file_path):
                  raise FileNotFoundError(
                      'ONNX file {} not found, please run torch2onnx first to generate it.'.format(onnx_file_path)
                  )
              print('Loading ONNX file from path {}...'.format(onnx_file_path))
              with open(onnx_file_path, 'rb') as model:
                  print('Beginning ONNX file parsing')
                  if not parser.parse(model.read()):
                      print('ERROR: Failed to parse the ONNX file.')
                      for error in range(parser.num_errors):
                          print(parser.get_error(error))
                      return None

              # Force the shape of the first network input to the requested one.
              network.get_input(0).shape = input_shape
              print('Completed parsing of ONNX file')
              print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
              engine = builder.build_engine(network, config)
              if engine is None:
                  # Without this check the failure would surface as an
                  # AttributeError on engine.serialize() below.
                  print('ERROR: Failed to build the TensorRT engine.')
                  return None
              print("Completed creating Engine")
              with open(engine_file_path, "wb") as f:
                  f.write(engine.serialize())
              return engine

      if os.path.exists(engine_file_path):
          # If a serialized engine exists, use it instead of building an engine.
          print("Reading engine from file {}".format(engine_file_path))
          with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
              return runtime.deserialize_cuda_engine(f.read())
      else:
          return build_engine()
  if __name__ == '__main__':
      # Convert the exported ONNX model into a serialized TensorRT engine,
      # or load the engine file if it has already been built.
      get_engine(
          [1, 3, 224, 224],
          'resnet.onnx',
          'resnet.engine',
      )

四、测试TensorRT模型精度

  1. import os
  2. import sys
  3. import cv2
  4. import copy
  5. import torch
  6. import numpy as np
  7. import time
  8. import onnxruntime
  9. import pycuda.driver as cuda
  10. import tensorrt as trt
  os.environ['CUDA_VISIBLE_DEVICES'] = '3'  # GPU index to run on
  TRT_LOGGER = trt.Logger()
  import trt_common
  # Network-creation flag requesting an explicit batch dimension.
  EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
  # The former reload(sys) / sys.setdefaultencoding('utf-8') shim was removed:
  # it only ever ran on Python 2, and Python 3 already defaults to UTF-8 and
  # has no built-in reload().
  # Small record type: a little nicer to use than a bare 2-tuple.
  class HostDeviceMem(object):
      """Pair of buffers kept together: ``host`` and ``device``."""

      def __init__(self, host_mem, device_mem):
          self.host, self.device = host_mem, device_mem

      def __str__(self):
          """Render both buffers, each under its own heading line."""
          return "\n".join(("Host:", str(self.host), "Device:", str(self.device)))

      def __repr__(self):
          """Use the same text as ``__str__``."""
          return self.__str__()
  def get_engine(engine_file_path):
      """Read the serialized engine at ``engine_file_path`` and deserialize it.

      Returns whatever ``runtime.deserialize_cuda_engine`` returns for the
      file's bytes.
      """
      with open(engine_file_path, "rb") as engine_file, trt.Runtime(TRT_LOGGER) as runtime:
          serialized_engine = engine_file.read()
          return runtime.deserialize_cuda_engine(serialized_engine)
  # Allocates every buffer an engine needs: host/device pairs for inputs and outputs.
  def allocate_buffers(engine):
      """Allocate one host/device buffer pair per binding of ``engine``.

      Returns:
          A tuple ``(inputs, outputs, bindings, stream)``: two lists of
          ``HostDeviceMem`` pairs, the list of device buffers converted with
          ``int()`` in binding order, and a new ``cuda.Stream``.
      """
      inputs, outputs, bindings = [], [], []
      stream = cuda.Stream()
      for binding in engine:
          # Element count of the binding, scaled by the engine's max batch size.
          n_elements = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
          np_dtype = trt.nptype(engine.get_binding_dtype(binding))
          # Page-locked host buffer plus a device buffer of the same byte size.
          host_mem = cuda.pagelocked_empty(n_elements, np_dtype)
          device_mem = cuda.mem_alloc(host_mem.nbytes)
          bindings.append(int(device_mem))
          # File the pair under inputs or outputs depending on the binding kind.
          target = inputs if engine.binding_is_input(binding) else outputs
          target.append(HostDeviceMem(host_mem, device_mem))
      return inputs, outputs, bindings, stream
  # Works for any number of inputs/outputs of a full-dimension network.
  # ``inputs`` and ``outputs`` are lists of HostDeviceMem objects.
  def do_inference_v2(context, bindings, inputs, outputs, stream):
      """Run one asynchronous inference pass and return the host outputs.

      Args:
          context: Execution context; its ``execute_async_v2`` is called.
          bindings: Passed to ``execute_async_v2`` as ``bindings``.
          inputs: HostDeviceMem pairs copied host -> device before the run.
          outputs: HostDeviceMem pairs copied device -> host after the run.
          stream: Stream on which the copies and the execution are enqueued.

      Returns:
          A list with the ``host`` buffer of every entry in ``outputs``.
      """
      # Upload the input data.
      for pair in inputs:
          cuda.memcpy_htod_async(pair.device, pair.host, stream)
      # Enqueue the inference itself.
      context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
      # Download the predictions.
      for pair in outputs:
          cuda.memcpy_dtoh_async(pair.host, pair.device, stream)
      # Block until everything enqueued on the stream has completed.
      stream.synchronize()
      host_results = []
      for pair in outputs:
          host_results.append(pair.host)
      return host_results
  if __name__ == '__main__':
      os.environ['CUDA_VISIBLE_DEVICES'] = '3'  # GPU index to run on
      onnx_name = 'resnet.onnx'
      trt_name = 'resnet.engine'
      image_path = 'test.jpg'

      session = onnxruntime.InferenceSession(onnx_name, providers=['CUDAExecutionProvider'])
      # Imported for its side effect of setting up a CUDA context for pycuda.
      import pycuda.autoprimaryctx
      engine = get_engine(trt_name)
      context = engine.create_execution_context()
      inputs, outputs, bindings, stream = allocate_buffers(engine)

      img = cv2.imread(image_path)
      if img is None:
          # cv2.imread signals a missing or unreadable file by returning None;
          # fail here rather than with an obscure error inside cv2.resize.
          raise FileNotFoundError("Cannot read image '{}'".format(image_path))
      img = cv2.resize(img, (224, 224))
      # HWC -> CHW, float32, then add a leading batch axis.
      img = img.transpose([2, 0, 1]).astype(np.float32)
      img = np.expand_dims(img, axis=0)

      t1 = time.time()
      onnx_preds = session.run(None, {"input": img})
      t2 = time.time()
      # The transposed array is not contiguous; pycuda rejects it otherwise
      # ("ValueError: ndarray is not contiguous").
      inputs[0].host = np.ascontiguousarray(img)
      trt_outputs = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
      # Copy the result so it survives later reuse of the output buffer.
      data = copy.deepcopy(trt_outputs[0])
      t3 = time.time()
      print("onnx: ", t2 - t1, " trt: ", t3 - t2)

      # Precision check: largest element-wise gap between the two runtimes.
      max_abs_diff = np.abs(np.asarray(onnx_preds[0]).ravel() - np.asarray(data).ravel()).max()
      print("max abs diff (onnx vs trt): ", max_abs_diff)

五、ERROR

error1

ERROR: Failed to parse the ONNX file.

In node 84 (importConv): UNSUPPORTED_NODE: Assertion failed: inputs.at(2).is_weights() && "The bias tensor is required to be an initializer for the Conv operator."

solution:

pip install onnx-simplifier

通过simplify重新保存ONNX模型

  1. import onnx
  2. from onnxsim import simplify
  # Load the exported model, simplify it and save the result under a new name.
  onnx_model = onnx.load('resnet.onnx')
  model_simp, check = simplify(onnx_model)
  # Do not save a model for which simplify() returned a false ``check`` flag.
  assert check, "Simplified ONNX model could not be validated"
  onnx.save(model_simp, 'resnet_sim.onnx')

error2

ValueError: ndarray is not contiguous

solution:

数组不连续,使用np.ascontiguousarray(img) 处理数组

inputs[0].host = np.ascontiguousarray(img)

error3

Error Code 1: Myelin (Compiled against cuBLASLt 11.11.3.0 but running against cuBLASLt 11.4.1.0.)

solution:

tensorrt 和 torch 同时使用时会调用不同版本的 libmyelin.so,因此二者不能同时使用。tensorrt 和 onnxruntime 同时使用时也会出现同样的问题。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/57663
推荐阅读
相关标签
  

闽ICP备14008679号