赞
踩
配置:
Ubuntu 16.04
python 3.6
onnx 1.6
pytorch 1.5
pycuda 2019.1.2
torchvision 0.1.8
建议详读,先安装好环境
官网指导手册
步骤:
1.将pytorch模型转换成onnx模型
这边用的是Darknet生成的pytorch模型
import torch
from torch.autograd import Variable
import onnx

# Names attached to the graph's input and output tensors in the ONNX file.
onnx_input_names = ['input']
onnx_output_names = ['output']

# Random example tensor (batch 1, 3 channels, 544x544), moved to the GPU.
dummy_input = Variable(torch.randn(1, 3, 544, 544)).cuda()

# x.model is the model generated by the author; it is moved to the GPU so
# that its weights live on the same device as the example input.
model = x.model.cuda()
# Alternative: model = torch.load('', map_location="cuda:0")

torch.onnx.export(
    model,
    dummy_input,
    'model.onnx',
    input_names=onnx_input_names,
    output_names=onnx_output_names,
    verbose=True,
)
其中
#model = x.model.cuda()
#若是不添加cuda()
model = x.model
出现报错
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
2.检查模型
# Load the exported file back and run the ONNX checker on it.
# The success message below is only reached if the checker does not raise.
onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)
print("==> Passed")
3.测试onnx模型使用tensorrt推理前后对比
import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import torch
import os
import time
from PIL import Image
import cv2
import torchvision

filename = '000000.jpg'
max_batch_size = 1
onnx_model_path = 'yolo.onnx'
TRT_LOGGER = trt.Logger()  # This logger is required to build an engine


def get_img_np_nchw(filename):
    """Load an image and return it as a normalized array of shape (1, 3, H, W).

    The image is read with OpenCV, converted from BGR to RGB, resized,
    scaled to [0, 1] and normalized per channel with the mean/std constants
    below.

    Raises:
        FileNotFoundError: if OpenCV cannot read ``filename``.
    """
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('Cannot read image file {}'.format(filename))
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # NOTE: the target size should match the input size the model expects.
    image_cv = cv2.resize(image_cv, (1920, 1080))
    miu = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    r = (img_np[:, :, 0] - miu[0]) / std[0]
    g = (img_np[:, :, 1] - miu[1]) / std[1]
    b = (img_np[:, :, 2] - miu[2]) / std[2]
    # Stack the channel planes (HWC -> CHW), then add the batch axis.
    img_np_t = np.array([r, g, b])
    img_np_nchw = np.expand_dims(img_np_t, axis=0)
    return img_np_nchw


class HostDeviceMem(object):
    """Pair of a host (CPU) buffer and its matching device (GPU) buffer."""

    def __init__(self, host_mem, device_mem):
        """Within this context, host_mem means the CPU memory and
        device_mem means the GPU memory.
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate host/device buffers for every binding of ``engine``.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of HostDeviceMem, ``bindings`` is the list of
        device buffer addresses in binding order, and ``stream`` is a new
        CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False,
               ):
    """Attempts to load a serialized engine if available, otherwise builds a
    new TensorRT engine and (when ``save_engine`` is true) saves it.

    Returns:
        The TensorRT engine, or None if the ONNX file could not be parsed or
        the engine could not be built.
    """

    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(EXPLICIT_BATCH) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 30  # Your workspace size
            builder.max_batch_size = max_batch_size
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # To be updated
                raise NotImplementedError

            # Parse model file
            if not os.path.exists(onnx_file_path):
                raise SystemExit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                # Parse exactly once: a second model.read() would return
                # empty bytes because the file has already been consumed.
                parsed_ok = parser.parse(model.read())
            if not parsed_ok:
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                print("===========Parsing fail!!!!=================")
                # Do not build an engine from a network that failed to parse.
                return None
            print('Completed parsing of ONNX file')

            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
            engine = builder.build_cuda_engine(network)
            if engine is None:
                print('Failed to build the engine')
                return None
            print("Completed creating Engine")
            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference pass and return the list of host output buffers."""
    # Transfer data from CPU to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    """Reshape the flat host output buffer to ``shape_of_output``."""
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)

# These two modes are dependent on hardwares
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)
if engine is None:
    raise SystemExit('Could not obtain a TensorRT engine from {}'.format(onnx_model_path))
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host # bindings

# Do inference
# NOTE: must match the output size of the model being run.
shape_of_output = (max_batch_size, 1000)
# Load data to the buffer.
# NOTE: this replaces the page-locked host buffer; the device buffer was
# sized from the engine's input binding, so the flattened image should have
# the same number of elements.
inputs[0].host = img_np_nchw.reshape(-1)
# inputs[1].host = ... for multiple input

t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print('TensorRT ok')

# Replace this with your own model; here it is torchvision's pretrained
# resnet50, which is downloaded from the network.
model = torchvision.models.resnet50(pretrained=True).cuda()
resnet_model = model.eval()

input_for_torch = torch.from_numpy(img_np_nchw).cuda()
# CUDA kernels are launched asynchronously, so synchronize around the timed
# region to measure the actual execution time rather than the launch time.
torch.cuda.synchronize()
t3 = time.time()
feat_2 = resnet_model(input_for_torch)
torch.cuda.synchronize()
t4 = time.time()
feat_2 = feat_2.cpu().data.numpy()
print('Pytorch ok!')

mse = np.mean((feat - feat_2)**2)
print("Inference time with the TensorRT engine: {}".format(t2-t1))
print("Inference time with the PyTorch model: {}".format(t4-t3))
print('MSE Error = {}'.format(mse))
print('All completed!')
报错:
In node -1 (importModel): INVALID_VALUE: Assertion failed: !_importer_ctx.network()->hasImplicitBatchDimension() && "This version of the ONNX parser only supports TensorRT INetworkDefinitions with an explicit batch dimension. Please ensure the network was created using the EXPLICIT_BATCH NetworkDefinitionCreationFlag."
解决:
def build_engine(max_batch_size, save_engine):
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network(EXPLICIT_BATCH) as network, \
trt.OnnxParser(network, TRT_LOGGER) as parser:
报错:
Traceback (most recent call last):
line 126, in <listcomp>
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
解决:
def get_img_np_nchw(filename):
image = cv2.imread(filename)
image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_cv = cv2.resize(image_cv, (1920, 1080))
输入的检测图像尺寸需要resize成model的input的size(此处为544×544);否则主机端数据大小与按模型输入分配的显存大小不一致,拷贝到GPU时就会报上面的错误
改为
def get_img_np_nchw(filename):
image = cv2.imread(filename)
image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image_cv = cv2.resize(image_cv, (544,544))
报错
line 139, in postprocess_the_outputs
h_outputs = h_outputs.reshape(*shape_of_output)
ValueError: cannot reshape array of size 5780 into shape (1,1000)
解决:
#shape_of_output = (max_batch_size, 1000)
#修改成自己模型output的大小
shape_of_output = (1,20,17,17)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。