TensorRT 算子
此样板代码提供了一个框架来运行所有算子示例。为了使它们可运行,请将具体的示例代码复制粘贴到指定的“example begin”和“example end”注释之间。
import numpy as np
import math # example_plugin_v2.py
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
class OutputAllocator(trt.IOutputAllocator):
    """Output allocator for tensors whose size is only known at execution time.

    TensorRT calls ``reallocate_output`` to obtain device memory for an output
    tensor and ``notify_shape`` to report the tensor's final shape. The caller
    reads ``allocated_mem`` and ``tensor_shape`` afterwards to copy the result
    back to the host.
    """

    def __init__(self, curr_size):
        """Create the allocator, pre-allocating ``curr_size`` bytes if positive.

        Args:
            curr_size: Initial device buffer size in bytes; ``0`` defers the
                allocation to the first ``reallocate_output`` call.
        """
        # The TensorRT bindings require the base class to be initialized
        # explicitly when IOutputAllocator is subclassed from Python.
        trt.IOutputAllocator.__init__(self)
        self.curr_size = curr_size
        self.allocated_mem = None
        if curr_size > 0:
            self.allocated_mem = cuda.mem_alloc(curr_size)
        self.tensor_shape = None

    def reallocate_output(self, tensor_name, memory, size, alignment):
        """Return a device address able to hold ``size`` bytes.

        A new buffer is allocated only when the request exceeds the current
        capacity; otherwise the existing buffer is reused.

        Args:
            tensor_name: Name of the output tensor (unused).
            memory: Address previously associated with the tensor (unused).
            size: Number of bytes required; must be positive.
            alignment: Required alignment (unused here).

        Returns:
            The device address of the buffer as an ``int``.
        """
        assert size > 0
        if size > self.curr_size:
            self.allocated_mem = cuda.mem_alloc(size)
            # Remember the new capacity so later, smaller requests reuse this
            # buffer instead of being compared against the stale initial size.
            self.curr_size = size
        return int(self.allocated_mem)

    def notify_shape(self, tensor_name, shape):
        """Record the output shape reported by TensorRT for later retrieval."""
        self.tensor_shape = shape
class Runner:
    """Bundle of the TensorRT objects and bookkeeping dicts used by an example.

    Attributes are filled in by ``example`` (network definition, inputs,
    output shapes, expected values) and consumed by ``run_example``.
    """

    def __init__(self, logger=trt.Logger(min_severity=trt.ILogger.Severity.INFO)):
        """Create builder, empty network, builder config and runtime.

        Args:
            logger: TensorRT logger shared by the builder and the runtime.
        """
        builder = trt.Builder(logger)
        self.builder = builder
        self.network = builder.create_network(flags=0)
        self.config = builder.create_builder_config()
        self.runtime = trt.Runtime(logger)

        # Per-example data, keyed by tensor name.
        self.inputs = dict()    # input name -> numpy array fed to the network
        self.outputs = dict()   # output name -> declared output shape
        self.expected = dict()  # output name -> reference values
        self.results = dict()   # output name -> values produced by inference

        self.logger = logger
def example(get_runner: Runner):
    """Populate the runner with one operator example and return it.

    The code between the "Example Begin" and "Example End" markers is a
    placeholder: replace it with the snippet of the operator to try. The
    snippet must add layers to ``network`` and fill ``inputs``, ``outputs``
    and ``expected`` keyed by tensor name.

    Args:
        get_runner: Runner whose network and dictionaries are filled in place.

    Returns:
        The same runner instance that was passed in.
    """
    network = get_runner.network
    inputs = get_runner.inputs
    outputs = get_runner.outputs
    expected = get_runner.expected

    # -------------------- Example Begin --------------------
    # Paste the code examples here
    # e.g. for Activation
    input_tensor = network.add_input("input1", dtype=trt.float32, shape=(2, 3))
    relu_layer = network.add_activation(input_tensor, type=trt.ActivationType.RELU)

    # ReLU zeroes the negative entries and passes the rest through.
    inputs[input_tensor.name] = np.array(
        [
            [-3.0, -2.0, -1.0],
            [0.0, 1.0, 2.0],
        ]
    )
    outputs[relu_layer.get_output(0).name] = relu_layer.get_output(0).shape
    expected[relu_layer.get_output(0).name] = np.array(
        [
            [0.0, 0.0, 0.0],
            [0.0, 1.0, 2.0],
        ]
    )
    # --------------------- Example End ---------------------

    return get_runner
def run_example():
    """Build the example network, run it once and compare against expectations.

    Steps:
        1. Create a ``Runner`` and let ``example`` define the network and data.
        2. Build and deserialize the engine, create an execution context.
        3. Bind every I/O tensor: host memory for shape-inference inputs,
           device memory for regular inputs and statically shaped outputs, an
           ``OutputAllocator`` for outputs whose shape is not yet known.
        4. Copy inputs to the device, execute, copy outputs back, synchronize.
        5. Reshape the outputs, log them and compare with the expected values
           using ``np.allclose`` with an absolute tolerance of ``0.1``.

    Mismatches are reported through the runner's logger at ERROR severity;
    the function itself returns ``None``.
    """
    example_runner = Runner()
    atol = 0.1

    network = example_runner.network
    inputs = example_runner.inputs
    outputs = example_runner.outputs
    expected = example_runner.expected
    builder = example_runner.builder
    config = example_runner.config
    runtime = example_runner.runtime
    results = example_runner.results

    def log_info(info):
        example_runner.logger.log(trt.ILogger.Severity.INFO, f"[Example] {info}")

    def log_error(info):
        example_runner.logger.log(trt.ILogger.Severity.ERROR, f"[Example] {info}")

    example_runner = example(example_runner)

    log_info("Building serialized network")
    serialized_engine = builder.build_serialized_network(network, config)
    assert serialized_engine is not None

    log_info("Creating engine")
    engine = runtime.deserialize_cuda_engine(serialized_engine)
    context = engine.create_execution_context()

    # Allocate host and device buffers
    in_mem = []  # (device allocation, host array) pairs to upload
    out_mem = {}  # output name -> (host buffer, device memory) or None
    output_allocators = {}  # output name -> OutputAllocator (dynamic outputs)
    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    for tensor in tensor_names:
        dtype = trt.nptype(engine.get_tensor_dtype(tensor))
        if engine.get_tensor_mode(tensor) == trt.TensorIOMode.INPUT:
            if engine.is_shape_inference_io(tensor):
                context.set_input_shape(tensor, inputs[tensor].shape)
                # Get input memory address from the numpy object.
                input_address = inputs[tensor].ctypes.data
                context.set_tensor_address(tensor, input_address)
            else:
                # Handle input tensors
                # This must be exclusive with the branch above, otherwise the
                # host address bound for a shape tensor would be overwritten.
                context.set_input_shape(tensor, inputs[tensor].shape)
                input_buffer = np.ascontiguousarray(inputs[tensor], dtype=dtype)
                input_memory = cuda.mem_alloc(input_buffer.nbytes)
                context.set_tensor_address(tensor, int(input_memory))
                in_mem.append((input_memory, input_buffer))
        else:  # Handle output tensors
            # Check if output tensor contains unknown shape
            if trt.volume(context.get_tensor_shape(tensor)) < 0:
                # Set an output allocator for the output tensor with unknown shape.
                # Initialize output allocator with 0 memory size, so reallocate always allocate.
                output_allocator = OutputAllocator(0)
                context.set_output_allocator(tensor, output_allocator)
                output_allocators[tensor] = output_allocator
                # No need to initialize output buffer and output memory here.
                out_mem[tensor] = None
            else:
                # Static shape: the buffers can be sized up front. Running this
                # for a dynamic output would request a negative-sized buffer
                # and clobber the None marker set above.
                size = trt.volume(context.get_tensor_shape(tensor))
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                context.set_tensor_address(tensor, int(output_memory))
                out_mem[tensor] = (output_buffer, output_memory)

    stream = cuda.Stream()

    # Transfer input data to the GPU.
    for device_memory, host_buffer in in_mem:
        cuda.memcpy_htod_async(device_memory, host_buffer, stream)

    # Run inference
    log_info("Running example")
    # Without this call nothing is computed and the copies below would read
    # uninitialized device memory.
    context.execute_async_v3(stream.handle)

    # Transfer prediction output from the GPU.
    for output in out_mem:
        output_mem = out_mem[output]
        if output_mem is None:
            # Must have been allocated using OutputAllocator.reallocate.
            assert output in output_allocators
            assert output_allocators[output].allocated_mem
            shape = output_allocators[output].tensor_shape
            assert shape is not None
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_tensor_dtype(output))
            output_buffer = cuda.pagelocked_empty(size, dtype)
            output_memory = context.get_tensor_address(output)
            output_mem = (output_buffer, output_memory)
            # Store tensor to output buffer and output memory mappings.
            out_mem[output] = output_mem
        cuda.memcpy_dtoh_async(output_mem[0], output_mem[1], stream)

    log_info("Synchronizing with cuda stream")
    # The copies above are asynchronous; wait for them before touching the
    # host buffers.
    stream.synchronize()
    log_info("Sync done")

    for output in out_mem:
        output_mem = out_mem[output][0]
        shape = outputs[output]
        # Outputs served by an allocator are exactly those whose shape was
        # unknown at bind time; check the current output rather than the
        # leftover loop variable from the binding loop.
        if output in output_allocators:
            # Get real output tensor size
            shape = output_allocators[output].tensor_shape
            assert shape is not None
            size = trt.volume(shape)
            output_mem = output_mem[:size]
        output_mem = output_mem.reshape(shape)
        results[output] = output_mem

    log_info(f"Network inputs: {inputs}")
    log_info(f"Inference results: {results}")
    log_info(f"Expected results: {expected}")

    # Check result
    is_equal = {}
    all_are_equal = True
    for output in expected:
        is_equal[output] = np.allclose(results[output], expected[output], atol=atol)
        all_are_equal &= is_equal[output]
    log_info(f"All results are expected: {all_are_equal}")

    if not all_are_equal:
        for output in is_equal:
            if not is_equal[output]:
                log_error(f"{output} mismatch:")
                log_error(f"expected - content:{expected[output]}")
                log_error(f"actual - content:{repr(results[output])}")

    log_info("Example complete")
if __name__ == "__main__":