我使用Nvidia的迁移学习工具包(TLT)进行训练,然后使用tlt-converter将.etlt模型转换为.engine文件。
我想使用这个 .engine 文件在 python 中进行推理。但由于我使用 TLT 进行训练,因此我没有任何冻结图或 pb 文件,而这正是所有 TensorRT 推理教程所需要的。
我想知道 python 推理是否可以在 .engine 文件上进行。 如果没有,支持哪些转换(UFF、ONNX)来实现这一点?
可以通过 .engine 文件进行 Python 推理。下面的示例从磁盘加载 .trt 文件(实际上与 .engine 文件相同)并执行单个推理。
在这个项目中,我在使用之前使用 onnx2trt 可执行文件将 ONNX 模型转换为 TRT 模型。您甚至可以使用 ONNX 作为中间件将 PyTorch 模型转换为 TRT。
import tensorrt as trt
import numpy as np
import os
import pycuda.driver as cuda
import pycuda.autoinit
class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its matching device allocation."""

    def __init__(self, host_mem, device_mem):
        # host_mem: pinned (page-locked) numpy array on the host
        # device_mem: raw CUDA device allocation of the same byte size
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n{}\nDevice:\n{}".format(self.host, self.device)

    def __repr__(self):
        return self.__str__()
class TrtModel:
    """Runs inference on a serialized TensorRT engine (.engine / .trt file).

    Loads the engine from disk, allocates a page-locked host buffer plus a
    device buffer for every binding, and executes with the implicit-batch
    ``execute_async`` API.

    NOTE(review): this uses the legacy binding API (``get_binding_shape``,
    ``binding_is_input``, ``execute_async``), deprecated in TensorRT 8.x and
    removed in TensorRT 10 — confirm the installed TensorRT version.
    """

    def __init__(self,engine_path,max_batch_size=1,dtype=np.float32):
        # engine_path: path to the serialized engine file on disk
        # max_batch_size: scales the size of every allocated buffer
        # dtype: numpy dtype used for ALL host buffers (assumes every
        #   binding shares this dtype — TODO confirm against the engine)
        self.engine_path = engine_path
        self.dtype = dtype
        self.logger = trt.Logger(trt.Logger.WARNING)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(self.runtime, self.engine_path)
        self.max_batch_size = max_batch_size
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
        self.context = self.engine.create_execution_context()

    @staticmethod
    def load_engine(trt_runtime, engine_path):
        """Deserialize a CUDA engine from the file at ``engine_path``."""
        # Register plugins first: engines built with plugin layers (common
        # for TLT/TAO-exported models) fail to deserialize otherwise.
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        engine = trt_runtime.deserialize_cuda_engine(engine_data)
        return engine

    def allocate_buffers(self):
        """Allocate a (pinned host, device) buffer pair for every binding.

        Returns ``(inputs, outputs, bindings, stream)`` where ``bindings``
        holds raw device pointers (as ints) in binding order, which is the
        form ``execute_async`` expects.
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            # Element count per binding, scaled by max_batch_size (the
            # binding shape excludes the batch dimension on implicit-batch
            # engines — TODO confirm this engine was built implicit-batch).
            size = trt.volume(self.engine.get_binding_shape(binding)) * self.max_batch_size
            host_mem = cuda.pagelocked_empty(size, self.dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def __call__(self,x:np.ndarray,batch_size=2):
        """Copy ``x`` into the first input binding, run inference, and return
        every output host buffer reshaped to ``(batch_size, -1)``.

        NOTE(review): the default ``batch_size=2`` disagrees with the
        constructor default ``max_batch_size=1``; callers should pass a batch
        size <= max_batch_size. ``x.ravel()`` must also match the input
        buffer size or ``np.copyto`` raises.
        """
        x = x.astype(self.dtype)
        np.copyto(self.inputs[0].host,x.ravel())
        # Stage every input on the device (only inputs[0] was refreshed above).
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
        self.context.execute_async(batch_size=batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        # Block until the async copies and the kernel have finished.
        self.stream.synchronize()
        return [out.host.reshape(batch_size,-1) for out in self.outputs]
if __name__ == "__main__":
    # Smoke test: run one random batch through a serialized engine on disk.
    batch_size = 1
    trt_engine_path = os.path.join("..","models","main.trt")
    model = TrtModel(trt_engine_path)
    # Binding 0 is taken to be the input; shape[0] is assumed to be the
    # batch dimension — TODO confirm for this particular engine.
    shape = model.engine.get_binding_shape(0)
    # Fake image data: integers in [0, 255) scaled into [0, 1).
    data = np.random.randint(0,255,(batch_size,*shape[1:]))/255
    result = model(data,batch_size)
大家注意安全!
您可以使用 Python 对 .engine 文件进行推理。有两种方法可以做到这一点:
我已经更新了 @Oguz Vuruskaner 的回答以及其引用的文章,以支持新版本的 TensorRT。
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit # Note: required! to initialize pycuda
import tensorrt as trt
class TensorRTInference:
    """Minimal inference wrapper for the TensorRT (>= 8.5) tensor I/O API.

    Deserializes an engine from disk, allocates one pinned host buffer and
    one device buffer per I/O tensor, and executes with
    ``execute_async_v3`` on a private CUDA stream.
    """

    def __init__(self, engine_path):
        # Logger/runtime must exist before the engine can be deserialized.
        self.logger = trt.Logger(trt.Logger.ERROR)
        self.runtime = trt.Runtime(self.logger)
        # Engine + execution context.
        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        # One host/device buffer pair per I/O tensor.
        (self.inputs,
         self.outputs,
         self.bindings,
         self.stream) = self.allocate_buffers(self.engine)

    def load_engine(self, engine_path):
        """Deserialize and return the CUDA engine stored at ``engine_path``."""
        with open(engine_path, "rb") as f:
            return self.runtime.deserialize_cuda_engine(f.read())

    class HostDeviceMem:
        """Pinned host buffer + device buffer for a single I/O tensor."""

        def __init__(self, host_mem, device_mem, shape):
            self.host = host_mem      # page-locked numpy array
            self.device = device_mem  # raw CUDA allocation
            self.shape = shape        # tensor shape, kept to un-flatten later

    def allocate_buffers(self, engine):
        """Allocate a (pinned host, device) buffer pair per I/O tensor.

        Returns ``(inputs, outputs, bindings, stream)``; ``bindings`` holds
        the raw device addresses in tensor-index order.
        """
        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()
        for idx in range(engine.num_io_tensors):
            name = engine.get_tensor_name(idx)
            shape = engine.get_tensor_shape(name)
            dtype = trt.nptype(engine.get_tensor_dtype(name))
            # Host side is pinned so async copies can overlap the stream.
            host_mem = cuda.pagelocked_empty(trt.volume(shape), dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            pair = self.HostDeviceMem(host_mem, device_mem, shape)
            # Route to the matching list depending on tensor direction.
            if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                inputs.append(pair)
            else:
                outputs.append(pair)
        return inputs, outputs, bindings, stream

    def infer(self, input_data):
        """Run one inference pass; returns outputs reshaped to their shapes."""
        # Stage the (single) input on the device.
        np.copyto(self.inputs[0].host, input_data.ravel())
        cuda.memcpy_htod_async(self.inputs[0].device, self.inputs[0].host, self.stream)
        # Tell the context where every I/O tensor lives on the device.
        for idx, address in enumerate(self.bindings):
            self.context.set_tensor_address(self.engine.get_tensor_name(idx), address)
        # Enqueue the inference kernel.
        self.context.execute_async_v3(stream_handle=self.stream.handle)
        # Pull predictions back to the host.
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
        # Wait for all queued copies/kernels to finish.
        self.stream.synchronize()
        # Un-flatten each output back to its original tensor shape.
        return [out.host.reshape(out.shape) for out in self.outputs]
注意:上面的代码片段与我引用的文章几乎相同,但有一些小调整。
如果您还在模型中使用 plugins,则必须在调用
self.load_engine(engine_path)
之前添加以下内容
# loading plugins
# Note: default namespace is ""
# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#register-plugin-create
trt.init_libnvinfer_plugins(self.logger, namespace="")