TX2上使用tensorrt加速推理onnx_trt_engine_datatype=trt.datatype.float

作者：infabc | 2024-02-03 18:41:18

踩

trt_engine_datatype=trt.datatype.float

前言：1. 关于ONNX的介绍请自行查阅相关资料；以下内容均以已经成功得到onnx文件为前提。

2. 为什么使用TensorRT框架加速，可以参考这篇文章：《不同的部署框架的区别、python与C++的效率区别、模型部署/ONNXRuntime/tensorrt》。

TX2在装系统时是默认安装tensorrt的，位置在/usr/src/tensorrt中,如图

我们只需要在自己的虚拟环境和系统自带的tensorrt之间建立软链接即可；具体方法见《Jetson Nano、TX2等 conda 虚拟环境中使用TensorRT、gi等》。

建立好软连接后就开始根据我们的onnx文件生成tensorrt加速后的trt文件了。

进入到 /usr/src/tensorrt 下的 bin 目录中，在该目录中打开终端

输入：

./trtexec --onnx=/home/zeh/Desktop/FPI/sim_file/net_cg.onnx --saveEngine=/home/zeh/Desktop/FPI/sim_file/net_cg.trt --workspace=1024

相当于是 ./trtexec --onnx=要加速的onnx文件的路径 --saveEngine=生成的trt加速后的文件保存路径 --workspace=1024（设置工作空间大小，单位为MB，不加可能会报错）

如：

成功截图：成功生成trt文件。

生成trt文件后就可以在tx2上进行推理了。下面是我的推理代码,包括predict和inference两个文件。

predict.py

import cv2

import tensorrt as trt

import numpy as np

import inference as inference_utils # TRT/TF inference wrappers

import torch

import os

import time

import exifread,re,json,requests,math,tqdm

from make_dataset import SiamUAV_test

# from tool import get_logger

if __name__ == "__main__":
    # 1. Build the inference wrapper.

    # Precision command line argument -> TRT Engine datatype
    TRT_PRECISION_TO_DATATYPE = {
        16: trt.DataType.HALF,
        32: trt.DataType.FLOAT,
    }

    # Requested datatype: key 16 selects trt.DataType.HALF.
    # NOTE(review): TRTInference only prints this value; the precision
    # actually used is whatever the serialized engine was built with.
    trt_engine_datatype = TRT_PRECISION_TO_DATATYPE[16]

    # batch size = 1
    max_batch_size = 1

    engine_file_path = "/home/zeh/Desktop/FPI/sim_file/net_cg.trt"
    onnx_file_path = "/home/zeh/Desktop/FPI/sim_file/net_cg.onnx"

    trt_inference_wrapper = inference_utils.TRTInference(
        engine_file_path, onnx_file_path,
        trt_engine_datatype, max_batch_size,
    )

    # 2. Load the dataset (replace with your own data directory).
    test_dir = "/home/zeh/Desktop/fpi_picture"
    dataset_test = SiamUAV_test(test_dir)
    dataloaders = torch.utils.data.DataLoader(
        dataset_test,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
    )

    # 3. Run inference sample by sample.
    for uav, satellite, X, Y, uav_path, sa_path in dataloaders:
        # Dataset-specific preparation: the wrapper takes NumPy arrays.
        z = uav.numpy()
        x = satellite.numpy()

        # The model has two inputs, so both arrays are passed to infer().
        trt_outputs = trt_inference_wrapper.infer(z, x)

        # Post-process the inference result. Only the first output is used;
        # converting just that array avoids turning the whole output list
        # into a tensor first, and the name no longer shadows builtin map().
        # The reshape requires that output to hold exactly 384 * 384 values.
        out_map = torch.tensor(trt_outputs[0])
        out_map = torch.reshape(out_map, (384, 384))
        out_map = torch.sigmoid(out_map)

inference.py

import os

import sys

import time

# from PIL import Image

import tensorrt as trt

import pycuda.driver as cuda

import pycuda.autoinit

import numpy as np

import cv2

# TensorRT logger singleton

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class HostDeviceMem(object):
    """Pairs a host buffer with the device allocation that mirrors it."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate host and device buffers for every binding of ``engine``.

    Args:
        engine: TensorRT engine; iterating it yields its bindings.

    Returns:
        tuple: ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of HostDeviceMem, ``bindings`` is the list of
        device allocations converted with ``int()`` in binding order, and
        ``stream`` is a newly created ``cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()

    for binding in engine:
        # Element count: volume of the binding shape times the max batch size.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))

        # Allocate host and device buffers of matching byte size.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))

        # Append to the appropriate list.
        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream

def load_engine(trt_path):
    """Deserialize a TensorRT engine from a serialized engine file.

    Args:
        trt_path (str): path of the serialized engine file.

    Returns:
        The engine returned by ``runtime.deserialize_cuda_engine``.

    Raises:
        RuntimeError: if deserialization yields ``None``, so the failure is
            reported here instead of surfacing later as an unrelated error.
    """
    # Deserialize the engine from the raw bytes of the file.
    with open(trt_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    if engine is None:
        raise RuntimeError(
            "Failed to deserialize TensorRT engine from {}".format(trt_path))
    return engine

class TRTInference(object):
    """Manages TensorRT objects for model inference."""

    def __init__(self, trt_engine_path, onnx_model_path, trt_engine_datatype=trt.DataType.FLOAT, batch_size=1):
        """Initializes TensorRT objects needed for model inference.

        Args:
            trt_engine_path (str): path of the serialized TensorRT engine
                file to load
            onnx_model_path (str): path of the .onnx model; accepted for
                interface compatibility but not read by this class
            trt_engine_datatype (trt.DataType): requested precision; only
                printed here, it does not alter the loaded engine
            batch_size (int): batch size; only printed here
        """
        # Display requested engine settings to stdout
        print("TensorRT inference engine settings:")
        print(" * Inference precision - {}".format(trt_engine_datatype))
        print(" * Max batch size - {}\n".format(batch_size))

        # The engine is always loaded from the serialized file.
        print("Loading cached TensorRT engine from {}".format(
            trt_engine_path))
        self.trt_engine = load_engine(trt_engine_path)

        # This allocates memory for network inputs/outputs on both CPU and GPU
        self.inputs, self.outputs, self.bindings, self.stream = allocate_buffers(self.trt_engine)

        # Execution context is needed for inference
        self.context = self.trt_engine.create_execution_context()

    def infer(self, full_img1, full_img2):
        """Runs the two-input model on the given arrays.

        No normalization or channel expansion is applied; both inputs are
        only converted to C-contiguous float32 and flattened.

        Args:
            full_img1: array-like data for the first input binding
            full_img2: array-like data for the second input binding

        Returns:
            list: one flat array per output binding. Each array is a copy,
            so results stay valid after later calls reuse the host buffers.
        """
        # Make the data contiguous float32 blocks.
        scale_img1 = np.array(full_img1, dtype=np.float32, order='C')
        scale_img2 = np.array(full_img2, dtype=np.float32, order='C')

        # Copy it into appropriate place into memory
        # (self.inputs was returned earlier by allocate_buffers())
        np.copyto(self.inputs[0].host, scale_img1.ravel())
        np.copyto(self.inputs[1].host, scale_img2.ravel())

        # Measure only the do_inference call (transfers + execution), not
        # the printing that follows.
        inference_start_time = time.time()
        trt_outputs = do_inference(
            self.context, bindings=self.bindings, inputs=self.inputs,
            outputs=self.outputs, stream=self.stream)
        elapsed_ms = int(round((time.time() - inference_start_time) * 1000))

        print("network output shape:{}".format(trt_outputs[0].shape))
        print("TensorRT inference time: {} ms".format(elapsed_ms))

        # do_inference hands back the reusable host buffers themselves;
        # copy them so the next infer() call cannot overwrite these results.
        return [np.array(output) for output in trt_outputs]

# This function is generalized for multiple inputs/outputs.

# inputs and outputs are expected to be lists of HostDeviceMem objects.

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Copy inputs to the device, execute the context, and fetch the outputs.

    Args:
        context: execution context whose ``execute_async`` is called.
        bindings: device buffer addresses passed to ``execute_async``.
        inputs: list of HostDeviceMem objects holding the input data.
        outputs: list of HostDeviceMem objects that receive the results.
        stream: CUDA stream on which the copies and the execution are queued.
        batch_size (int): batch size forwarded to ``execute_async``.

    Returns:
        list: the ``host`` buffer of every entry in ``outputs``; these are
        the same buffer objects, not copies.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Run inference.
    # NOTE(review): execute_async takes a batch_size, i.e. the implicit-batch
    # API; confirm it is the right call for engines built from ONNX.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)

    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Wait for all queued work before the host buffers are read.
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/article/detail/57623