Contents
1. Introduction to TensorRT
2. Optimizing a Face Detection Model with TensorRT
3. Deploying the TRT File on the Jetson Nano
4. Summary
1. Introduction to TensorRT
TensorRT is a C++ library developed by NVIDIA for high-performance inference on NVIDIA GPUs. It is designed to integrate seamlessly with existing deep learning frameworks such as MXNet, PyTorch, TensorFlow, and Caffe, and it focuses exclusively on optimizing the inference stage.
For more details, see:
https://blog.csdn.net/g11d111/article/details/92061884
//www.greatytc.com/p/c9bb92b85905
2. Optimizing a Face Detection Model with TensorRT
The previous article (//www.greatytc.com/p/2f400c25179b) introduced several face detection algorithms; here we pick the third one, RFB-Net, for deployment on the Jetson Nano. Note that the serialized file TensorRT generates is not portable across platforms: a file built on a Jetson Nano cannot be used on a regular desktop GPU.
The original plan was to parse an ONNX model, but for reasons unknown the full network could not be parsed, so the Caffe model is parsed instead. TensorRT inference consists of three steps: building the engine, deserializing the engine, and running inference. The code is as follows:
# coding: UTF-8
import tensorrt as trt

# Collects TensorRT log messages; required argument for trt.Builder
TRT_LOGGER = trt.Logger()
# Caffe model files
deploy_file = './caffe/model/RFB-320/RFB-320.prototxt'
model_file = './caffe/model/RFB-320/RFB-320.caffemodel'
# Output file name; the extension can be trt, plan, or engine
trt_path = './face_dec32.trt'
# Data type: trt.float32, trt.float16, or trt.int8
# trt.int8 requires a calibration step, and the Jetson Nano does not support INT8
DTYPE = trt.float32

# Create the builder, network, and parser
with trt.Builder(TRT_LOGGER) as builder, \
        builder.create_network() as network, \
        trt.CaffeParser() as parser:
    # Maximum workspace memory, e.g. 1 << 30 is 1 GB and 2 * (1 << 30) is 2 GB
    builder.max_workspace_size = 1 << 30
    # Batch size; matching the inference batch size gives the best optimization
    builder.max_batch_size = 1
    print("Building TensorRT engine. This may take a few minutes.")
    # The parser returns model_tensors, a table mapping tensor names to ITensor objects
    model_tensors = parser.parse(deploy=deploy_file, model=model_file, network=network, dtype=DTYPE)
    # Check the shapes of the required nodes
    input = model_tensors.find('input')
    box = model_tensors.find('boxes')
    scores = model_tensors.find('scores')
    # The 'output' node is a Concat layer I added that merges boxes and scores;
    # it can be added by editing deploy_file directly
    output = model_tensors.find('output')
    for each in [input, box, scores, output]:
        print('\033[33m\tname:{name}, shape:{shape}\033[0m'.format(name=each.name, shape=each.shape))
    '''
    The output looks like this:
    name:input, shape:(3, 240, 320)
    name:boxes, shape:(4420, 4)
    name:scores, shape:(4420, 2)
    name:output, shape:(4420, 6)
    '''
    # Mark the network output
    network.mark_output(output)
    # Build the engine and save it; this takes a while
    engine = builder.build_cuda_engine(network)
    with open(trt_path, "wb") as f:
        f.write(engine.serialize())

# Read back the saved trt file
with open(trt_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
# Check the input and output bindings
print(engine.get_binding_shape(0))  # (3, 240, 320)
print(engine.get_binding_shape(1))  # (4420, 6)
If the following error appears:
Caffe Parser: Invalid reshape param. TensorRT does not support reshape in N (batch) dimension
change the first dim parameter of every Reshape layer in the prototxt from 1 to 0 (in Caffe's Reshape layer, dim: 0 copies that dimension from the input blob, which keeps the batch dimension out of the reshape). A rewritten Reshape layer looks like the sketch below.
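A minimal prototxt sketch of such a layer; the layer and blob names here are illustrative, not copied from the actual RFB-320.prototxt:

layer {
  name: "boxes_reshape"   # illustrative name, not from the real prototxt
  type: "Reshape"
  bottom: "boxes_conv"
  top: "boxes"
  reshape_param {
    shape {
      dim: 0    # was dim: 1; 0 copies the batch dimension from the input
      dim: -1
      dim: 4
    }
  }
}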
3. Deploying the TRT File on the Jetson Nano
With the TensorRT-optimized serialized file from above, we can now run inference on the Jetson Nano. The inference code is as follows:
import tensorrt as trt
import numpy as np
import cv2
import pycuda.driver as cuda
import time
import os
import pycuda.autoinit
from box_util import *

TRT_LOGGER = trt.Logger()
trt_path = './face_dec.trt'

# Load the data and feed it into the provided pagelocked_buffer.
def load_normalized_data(data_path, pagelocked_buffer, target_size=(320, 240)):
    ori_image = cv2.imread(data_path)
    ori_image = cv2.cvtColor(ori_image, cv2.COLOR_BGR2RGB)
    image = (cv2.resize(ori_image, target_size) - 127.0) / 128
    image = np.transpose(image, [2, 0, 1])
    # Flatten the image into a 1D array, normalize, and copy to pagelocked memory.
    np.copyto(pagelocked_buffer, image.ravel())
    return ori_image

# Initialization: create the engine and allocate host/device memory for inputs and outputs.
def init():
    with open(trt_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    print(engine.get_binding_shape(0))
    print(engine.get_binding_shape(1))
    # 1. Allocate some host and device buffers for inputs and outputs:
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(trt.float32))
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(trt.float32))
    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    context = engine.create_execution_context()
    return context, h_input, h_output, stream, d_input, d_output

# @profile
def inference(data_path):
    global context, h_input, h_output, stream, d_input, d_output
    image = load_normalized_data(data_path, pagelocked_buffer=h_input)
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()
    return h_output, image

if __name__ == '__main__':
    img_path = './img/'
    context, h_input, h_output, stream, d_input, d_output = init()
    # To speed up inference, the prior boxes are precomputed (by modifying
    # box_util.py) and loaded from disk; see the sketch after this block.
    priors = np.load('priors.npy')
    listdir = os.listdir(img_path)
    listdir = [each.strip() for each in listdir]
    print(listdir)
    for _ in range(1):
        for file_path in listdir:
            t1 = time.time()
            output, image = inference(img_path + file_path)
            # The Caffe model does not include post-processing, so it is done here;
            # all helper functions live in box_util.py.
            # Pass output.copy() if you want to keep each output independently.
            fix_image(output.copy(), image, priors, file_path)
            print("Inference time:", time.time() - t1)
The fix_image function in box_util.py is as follows:
def fix_image(output, img_ori, priors, file_path=None):
    # Reshape the flat output and split it into boxes and scores
    output = np.expand_dims(np.reshape(output, (-1, 6)), axis=0)
    # priors = define_img_size(input_size)
    boxes = output[:, :, :4]
    scores = output[:, :, 4:]
    boxes = convert_locations_to_boxes(boxes, priors, center_variance, size_variance)
    boxes = center_form_to_corner_form(boxes)
    boxes, labels, probs = predict(img_ori.shape[1], img_ori.shape[0], scores, boxes,
                                   prob_threshold=prob_threshold, iou_threshold=iou_threshold)
    # if len(boxes) == 0:
    #     return []
    # return boxes[0, :]
    for i in range(boxes.shape[0]):
        box = boxes[i, :]
        cv2.rectangle(img_ori, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
    cv2.imwrite(os.path.join('./result/', file_path), img_ori)
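For reference, convert_locations_to_boxes performs the standard SSD offset decoding: predicted offsets are interpreted relative to the prior centers and sizes. A minimal sketch under the usual SSD conventions (details may differ from the actual box_util.py implementation):

import numpy as np

def convert_locations_to_boxes_sketch(locations, priors, center_variance, size_variance):
    # locations, priors: arrays of shape (..., 4) in center form (cx, cy, w, h).
    centers = locations[..., :2] * center_variance * priors[..., 2:] + priors[..., :2]
    sizes = np.exp(locations[..., 2:] * size_variance) * priors[..., 2:]
    # Result is still in center form; center_form_to_corner_form converts it afterwards.
    return np.concatenate([centers, sizes], axis=-1)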
One more thing to note: if you want to capture from the camera through cv2, some special setup is required:
def get_jetson_gstreamer_source(capture_width=1280, capture_height=720,
                                display_width=1280, display_height=720,
                                framerate=60, flip_method=2):
    """
    Return an OpenCV-compatible video source description that uses gstreamer
    to capture video from the camera on a Jetson Nano
    """
    # Adjust flip_method if the picture appears upside down
    return (
        f'nvarguscamerasrc ! video/x-raw(memory:NVMM), ' +
        f'width=(int){capture_width}, height=(int){capture_height}, ' +
        f'format=(string)NV12, framerate=(fraction){framerate}/1 ! ' +
        f'nvvidconv flip-method={flip_method} ! ' +
        f'video/x-raw, width=(int){display_width}, height=(int){display_height}, format=(string)BGRx ! ' +
        'videoconvert ! video/x-raw, format=(string)BGR ! appsink')

cap = cv2.VideoCapture(get_jetson_gstreamer_source(), cv2.CAP_GSTREAMER)
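A short usage sketch continuing from the cap created above (the display-and-quit loop is illustrative, not from the original post):

while cap.isOpened():
    ret, frame = cap.read()  # frames arrive as BGR images from the pipeline above
    if not ret:
        break
    cv2.imshow('camera', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):  # press q to quit
        break
cap.release()
cv2.destroyAllWindows()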
4. Summary
Here we used TensorRT to convert the Caffe face detection model into a trt file and deployed it for inference. In the next article we will optimize the post-processing step by folding that computation into the trt file itself, so it outputs box coordinates directly and simplifies the pipeline.