概述

此应用程序在输入视频中跟踪边界框，在每一帧上绘制它们，并将结果保存在视频文件中。您可以定义将用于处理的后端。

注意: 输出将为灰度，因为该算法目前不支持彩色输入。

说明

命令行参数为

<backend> <input video> <input bboxes>

其中

backend: cpu、cuda 或 pva；它定义了将执行处理的后端。
input video: 输入视频文件名，它接受 OpenCV 的 cv::VideoCapture 接受的所有视频类型。
input bboxes: 包含输入边界框以及它们出现帧的文件。该文件由多行组成，格式如下
```
   <frame> <bbox_x> <bbox_y> <bbox_width> <bbox_height>
```
重要的是，这些行按照帧的升序排列。

这是一个示例

C++
./vpi_sample_06_klt_tracker cuda ../assets/dashcam.mp4 ../assets/dashcam_bboxes.txt
Python
python3 main.py cuda ../assets/dashcam.mp4 ../assets/dashcam_bboxes.txt

上述命令使用 CUDA 后端以及随示例提供的视频和边界框文件。C++ 示例会把跟踪到的边界框渲染到 klt_cuda.mp4 中；Python 示例的输出文件名还包含 Python 主版本号，例如 klt_python3_cuda.mp4。

结果

跟踪结果

注意: 视频输出需要支持 HTML5 且支持 H.264 mp4 视频解码的浏览器。

源代码

为了方便起见，以下代码也安装在 samples 目录中。

语言 C++ Python

 from __future__ import print_function
 
 import sys
 from argparse import ArgumentParser
 import numpy as np
 import vpi
 import cv2
 
 
 # Convert a colored input frame to grayscale (if needed)
 # and then, if using PVA backend, convert it to 16-bit unsigned pixels;
 # The converted frame is copied before wrapping it as a VPI image so
 # later draws in the gray frame do not change the reference VPI image.
 def convertFrameImage(inputFrame, backend)
  if inputFrame.ndim == 3 and inputFrame.shape[2] == 3
  grayFrame = cv2.cvtColor(inputFrame, cv2.COLOR_BGR2GRAY)
  else
  grayFrame = inputFrame
  if backend == vpi.Backend.PVA
  # PVA only supports 16-bit unsigned inputs,
  # where each element is in 0-255 range, so
  # no rescaling is needed.
  grayFrame = grayFrame.astype(np.uint16)
  grayImage = vpi.asimage(grayFrame.copy())
  return grayFrame, grayImage
 
 
 # Write the input gray frame to output video with
 # input bounding boxes and predictions
 def writeOutput(outVideo, cvGray, inBoxes, inPreds, colors, backend)
  try
  if cvGray.dtype == np.uint16
  cvGray = cvGray.astype(np.uint8)
  if cvGray.dtype != np.uint8
  raise Exception('Input frame format must be grayscale, 8-bit unsigned')
  cvGrayBGR = cv2.cvtColor(cvGray, cv2.COLOR_GRAY2BGR)
 
  # Tracking the number of valid bounding boxes in the current frame
  numValidBoxes = 0
 
  # Draw the input bounding boxes considering the input predictions
  with inBoxes.rlock_cpu(), inPreds.rlock_cpu() as pred
  # Array of bounding boxes (bbox) and predictions (pred)
  bbox = inBoxes.cpu().view(np.recarray)
 
  for i in range(inBoxes.size)
  if bbox[i].tracking_status == vpi.KLTTrackStatus.LOST
  # If the tracking status of the current bounding box is lost, skip it
  continue
 
  # Gather information of the current (i) bounding box and prediction
  # Prediction scaling width, height and x, y
  predScaleWidth = pred[i][0, 0]
  predScaleHeight = pred[i][1, 1]
  predX = pred[i][0, 2]
  predY = pred[i][1, 2]
 
  # Bounding box scaling width, height and x, y and bbox width, height
  bboxScaleWidth = bbox[i].bbox.xform.mat3[0, 0]
  bboxScaleHeight = bbox[i].bbox.xform.mat3[1, 1]
  bboxX = bbox[i].bbox.xform.mat3[0, 2]
  bboxY = bbox[i].bbox.xform.mat3[1, 2]
  bboxWidth = bbox[i].bbox.width
  bboxHeight = bbox[i].bbox.height
 
  # Compute corrected x, y and width, height (w, h) by proper adding
  # bounding box and prediction x, y and by proper multiplying
  # bounding box w, h with its own scaling and prediction scaling
  x = bboxX + predX
  y = bboxY + predY
  w = bboxWidth * bboxScaleWidth * predScaleWidth
  h = bboxHeight * bboxScaleHeight * predScaleHeight
 
  # Start point and end point of the bounding box for OpenCV drawing
  startPoint = tuple(np.array([x, y], dtype=int))
  endPoint = tuple(np.array([x, y], dtype=int) + np.array([w, h], dtype=int))
 
  # The color of the bounding box to be drawn
  bboxColor = tuple([ int(c) for c in colors[0, i] ])
  cv2.rectangle(cvGrayBGR, startPoint, endPoint, bboxColor, 2)
 
  # Incrementing the number of valid bounding boxes in the current frame
  numValidBoxes += 1
 
  print(' Valid: {:02d} boxes'.format(numValidBoxes))
 
  outVideo.write(cvGrayBGR)
  except Exception as e
  print('Error while writing output video:\n', e, file=sys.stderr)
  exit(1)
 
 
 # ----------------------------
 # Parse command line arguments
 
 parser = ArgumentParser()
 parser.add_argument('backend', choices=['cpu','cuda','pva'],
  help='Backend to be used for processing')
 
 parser.add_argument('input',
  help='Input video')
 
 parser.add_argument('boxes',
  help='Text file with bounding boxes description')
 
 args = parser.parse_args()
 
 if args.backend == 'cpu'
  backend = vpi.Backend.CPU
 elif args.backend == 'cuda'
  backend = vpi.Backend.CUDA
 else
  assert args.backend == 'pva'
  backend = vpi.Backend.PVA
 
 # -----------------------------
 # Open input and output videos
 
 inVideo = cv2.VideoCapture(args.input)
 
 fourcc = cv2.VideoWriter_fourcc(*'MPEG')
 inSize = (int(inVideo.get(cv2.CAP_PROP_FRAME_WIDTH)), int(inVideo.get(cv2.CAP_PROP_FRAME_HEIGHT)))
 fps = inVideo.get(cv2.CAP_PROP_FPS)
 
 outVideo = cv2.VideoWriter('klt_python'+str(sys.version_info[0])+'_'+args.backend+'.mp4',
  fourcc, fps, inSize)
 
 if not outVideo.isOpened()
  print("Error creating output video", file=sys.stderr)
  exit(1)
 
 # -----------------------------
 # Reading input bounding boxes
 
 # All boxes is a dictionary of all bounding boxes to be tracked in the input video,
 # where each value is a list of new bounding boxes to track at the frame indicated by its key
 allBoxes = {}
 totalNumBoxes = 0
 
 # Array capacity 0 means no restricted maximum number of bounding boxes
 arrayCapacity = 0
 
 if backend == vpi.Backend.PVA
  # PVA requires 128 array capacity or maximum number of bounding boxes
  arrayCapacity = 128
 
 with open(args.boxes) as f
  # The input file (f) should have one bounding box per lines as:
  # "startFrame bboxX bboxY bboxWidth bboxHeight"; e.g.: "61 547 337 14 11"
  for line in f.readlines()
  line = line.replace('\n', '').replace('\r', '')
  startFrame, x, y, w, h = [ float(v) for v in line.split(' ') ]
  bb = (x, y, w, h)
  if startFrame not in allBoxes
  allBoxes[startFrame] = [bb]
  else
  allBoxes[startFrame].append(bb)
  totalNumBoxes += 1
  if totalNumBoxes == arrayCapacity
  # Stop adding boxes if its total reached the array capacity
  break
 
 curFrame = 0
 curNumBoxes = len(allBoxes[curFrame])
 
 # ------------------------------------------------------------------------------
 # Initialize VPI array with all input bounding boxes (same as C++ KLT sample)
 
 if arrayCapacity == 0
  arrayCapacity = totalNumBoxes
 
 inBoxes = vpi.Array(arrayCapacity, vpi.Type.KLT_TRACKED_BOUNDING_BOX)
 
 inBoxes.size = totalNumBoxes
 with inBoxes.wlock_cpu()
  data = inBoxes.cpu().view(np.recarray)
 
  # Global index i of all bounding boxes data, starting at 0
  i = 0
 
  for f in sorted(allBoxes.keys())
  for bb in allBoxes[f]
  # Each bounding box bb is a tuple of (x, y, w, h)
  x, y, w, h = bb
 
  # The bounding box data is the identity for the scaling part,
  # meaning no scaling, and the offset part is its position x, y
  data[i].bbox.xform.mat3[0, 0] = 1
  data[i].bbox.xform.mat3[1, 1] = 1
  data[i].bbox.xform.mat3[2, 2] = 1
  data[i].bbox.xform.mat3[0, 2] = x
  data[i].bbox.xform.mat3[1, 2] = y
 
  # The bounding box data stores its width and height w, h
  data[i].bbox.width = w
  data[i].bbox.height = h
 
  # Initially all boxes have status tracked and update needed
  data[i].tracking_status = vpi.KLTTrackStatus.TRACKED
  data[i].template_status = vpi.KLTTemplateStatus.UPDATE_NEEDED
 
  # Incrementing the global index for the next bounding box
  i += 1
 
 #-------------------------------------------------------------------------------
 # Generate random colors for bounding boxes equal to the C++ KLT sample
 
 hues = np.zeros((totalNumBoxes,), dtype=np.uint8)
 
 if int(cv2.__version__.split('.')[0]) >= 3
  cv2.setRNGSeed(1)
  hues = cv2.randu(hues, 0, 180)
 else
  # Random differs in OpenCV-2.4
  rng = cv2.cv.RNG(1)
  hues = cv2.cv.fromarray(np.array([[ h for h in hues ]], dtype=np.uint8))
  cv2.cv.RandArr(rng, hues, cv2.cv.CV_RAND_UNI, 0, 180)
  hues = [ hues[0, i] for i in range(totalNumBoxes) ]
 
 colors = np.array([[ [int(h), 255, 255] for h in hues ]], dtype=np.uint8)
 colors = cv2.cvtColor(colors, cv2.COLOR_HSV2BGR)
 
 #-------------------------------------------------------------------------------
 # Initialize the KLT Feature Tracker algorithm
 
 # Load up first frame
 validFrame, cvFrame = inVideo.read()
 if not validFrame
  print("Error reading first input frame", file=sys.stderr)
  exit(1)
 
 # Convert OpenCV frame to gray returning also the VPI image for given backend
 cvGray, imgTemplate = convertFrameImage(cvFrame, backend)
 
 # Create the KLT Feature Tracker object using the backend specified by the user
 klt = vpi.KLTFeatureTracker(imgTemplate, inBoxes, backend=backend)
 
 #-------------------------------------------------------------------------------
 # Main processing loop
 
 while validFrame
  print('Frame: {:04d} ; Total: {:02d} boxes ;'.format(curFrame, curNumBoxes), end='')
 
  # Adjust input boxes and predictions to the current number of boxes
  inPreds = klt.in_predictions()
 
  inPreds.size = curNumBoxes
  inBoxes.size = curNumBoxes
 
  # Write current frame to the output video
  writeOutput(outVideo, cvGray, inBoxes, inPreds, colors, backend)
 
  # Read next input frame
  curFrame += 1
  validFrame, cvFrame = inVideo.read()
  if not validFrame
  break
 
  cvGray, imgReference = convertFrameImage(cvFrame, backend)
 
  outBoxes = klt(imgReference)
 
  if curFrame in allBoxes
  curNumBoxes += len(allBoxes[curFrame])
 
 outVideo.release()
 
 # vim: ts=8:sw=4:sts=4:et:ai

 #include <opencv2/core/version.hpp>
 #include <opencv2/imgcodecs.hpp>
 #include <opencv2/imgproc/imgproc.hpp>
 #include <opencv2/videoio.hpp>
 #include <vpi/OpenCVInterop.hpp>
 
 #include <vpi/Array.h>
 #include <vpi/Image.h>
 #include <vpi/Status.h>
 #include <vpi/Stream.h>
 #include <vpi/algo/KLTFeatureTracker.h>
 
 #include <cstring> // for memset
 #include <fstream>
 #include <iostream>
 #include <map>
 #include <sstream>
 #include <vector>
 
 #define CHECK_STATUS(STMT) \
  do \
  { \
  VPIStatus status = (STMT); \
  if (status != VPI_SUCCESS) \
  { \
  char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH]; \
  vpiGetLastStatusMessage(buffer, sizeof(buffer)); \
  std::ostringstream ss; \
  ss << vpiStatusGetName(status) << ": " << buffer; \
  throw std::runtime_error(ss.str()); \
  } \
  } while (0);
 
 // 绘制边界框到图像中并保存到磁盘的实用工具。
 static cv::Mat WriteKLTBoxes(VPIImage img, VPIArray boxes, VPIArray preds)
 {
  // 将 img 转换为 cv::Mat
  cv::Mat out;
  {
  VPIImageData imgdata;
  CHECK_STATUS(vpiImageLockData(img, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
 
  assert(imgdata.bufferType == VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR);
  VPIImageBufferPitchLinear &imgPitch = imgdata.buffer.pitch;
 
  int cvtype;
  switch (imgPitch.format)
  {
  case VPI_IMAGE_FORMAT_U8
  cvtype = CV_8U;
  break;
 
  case VPI_IMAGE_FORMAT_S8
  cvtype = CV_8S;
  break;
 
  case VPI_IMAGE_FORMAT_U16
  cvtype = CV_16UC1;
  break;
 
  case VPI_IMAGE_FORMAT_S16
  cvtype = CV_16SC1;
  break;
 
  default
  throw std::runtime_error("不支持的图像类型");
  }
 
  cv::Mat cvimg(imgPitch.planes[0].height, imgPitch.planes[0].width, cvtype, imgPitch.planes[0].data,
  imgPitch.planes[0].pitchBytes);
 
  if (cvimg.type() == CV_16U)
  {
  cvimg.convertTo(out, CV_8U);
  cvimg = out;
  out = cv::Mat();
  }
 
  cvtColor(cvimg, out, cv::COLOR_GRAY2BGR);
 
  CHECK_STATUS(vpiImageUnlock(img));
  }
 
  // 现在绘制边界框。
  VPIArrayData boxdata;
  CHECK_STATUS(vpiArrayLockData(boxes, VPI_LOCK_READ, VPI_ARRAY_BUFFER_HOST_AOS, &boxdata));
 
  VPIArrayData preddata;
  CHECK_STATUS(vpiArrayLockData(preds,  
  auto *pboxes = reinterpret_cast<VPIKLTTrackedBoundingBox *>(boxdata.buffer.aos.data);
  auto *ppreds = reinterpret_cast<VPIHomographyTransform2D *>(preddata.buffer.aos.data);
 
  // Use random high-saturated colors
  static std::vector<cv::Vec3b> colors;
  if ((int)colors.size() != *boxdata.buffer.aos.sizePointer)
  {
  colors.resize(*boxdata.buffer.aos.sizePointer);
 
  cv::RNG rand(1);
  for (size_t i = 0; i < colors.size(); ++i)
  {
  colors[i] = cv::Vec3b(rand.uniform(0, 180), 255, 255);
  }
  cvtColor(colors, colors, cv::COLOR_HSV2BGR);
  }
 
  // For each tracked bounding box...
  for (int i = 0; i < *boxdata.buffer.aos.sizePointer; ++i)
  {
  if (pboxes[i].trackingStatus == 1)
  {
  continue;
  }
 
  float x, y, w, h;
  x = pboxes[i].bbox.xform.mat3[0][2] + ppreds[i].mat3[0][2];
  y = pboxes[i].bbox.xform.mat3[1][2] + ppreds[i].mat3[1][2];
  w = pboxes[i].bbox.width * pboxes[i].bbox.xform.mat3[0][0] * ppreds[i].mat3[0][0];
  h = pboxes[i].bbox.height * pboxes[i].bbox.xform.mat3[1][1] * ppreds[i].mat3[1][1];
 
  rectangle(out, cv::Rect(x, y, w, h), cv::Scalar(colors[i][0], colors[i][1], colors[i][2]), 2);
  }
 
  CHECK_STATUS(vpiArrayUnlock(preds));
  CHECK_STATUS(vpiArrayUnlock(boxes));
 
  return out;
 }
 
 // Entry point: tracks the bounding boxes listed in <bbox descr> through
 // <input_video> on the chosen backend and writes every frame, with the boxes
 // drawn, to "klt_<backend>.mp4". Returns 0 on success and 1 when any
 // std::exception is caught (its message is printed to stderr).
 int main(int argc, char *argv[])
 {
  // OpenCV image that will be wrapped by a VPIImage.
  // Define it here so that it's destroyed *after* wrapper is destroyed
  cv::Mat cvTemplate, cvReference;
 
  // Arrays that will store our input bboxes and predicted transform.
  VPIArray inputBoxList = NULL, inputPredList = NULL;
 
  // Other VPI objects that will be used
  VPIStream stream = NULL;
  VPIArray outputBoxList = NULL;
  VPIArray outputEstimList = NULL;
  VPIPayload klt = NULL;
  VPIImage imgReference = NULL;
  VPIImage imgTemplate = NULL;
 
  int retval = 0;
  try
  {
  if (argc != 4)
  {
  throw std::runtime_error(std::string("Usage: ") + argv[0] + " <cpu|pva|cuda> <input_video> <bbox descr>");
  }
 
  std::string strBackend = argv[1];
  std::string strInputVideo = argv[2];
  std::string strInputBBoxes = argv[3];
 
  // Load the input video
  cv::VideoCapture invid;
  if (!invid.open(strInputVideo))
  {
  throw std::runtime_error("Can't open '" + strInputVideo + "'");
  }
 
  // Open the output video for writing using input's characteristics
  int w = invid.get(cv::CAP_PROP_FRAME_WIDTH);
  int h = invid.get(cv::CAP_PROP_FRAME_HEIGHT);
  int fourcc = cv::VideoWriter::fourcc('M', 'P', 'E', 'G');
  double fps = invid.get(cv::CAP_PROP_FPS);
 
  cv::VideoWriter outVideo("klt_" + strBackend + ".mp4", fourcc, fps, cv::Size(w, h));
  if (!outVideo.isOpened())
  {
  throw std::runtime_error("Can't create output video");
  }
 
  // Load the bounding boxes
  // Format is: <frame number> <bbox_x> <bbox_y> <bbox_width> <bbox_height>
  // Important assumption: bboxes must be sorted with increasing frame numbers.
 
  // These arrays will actually wrap these vectors.
  std::vector<VPIKLTTrackedBoundingBox> bboxes;
  int32_t bboxesSize = 0;
  std::vector<VPIHomographyTransform2D> preds;
  int32_t predsSize = 0;
 
  // Stores how many bboxes there are in each frame. Only
  // stores when the bboxes count change.
  std::map<int, size_t> bboxes_size_at_frame; // frame -> bbox count
 
  // PVA requires that array capacity is 128.
  // Reserving up front also keeps &bboxes[0] / &preds[0] stable while
  // at most 128 elements are pushed, which the wrappers below rely on.
  bboxes.reserve(128);
  preds.reserve(128);
 
  // Read bounding boxes
  {
  std::ifstream in(strInputBBoxes);
  if (!in)
  {
  throw std::runtime_error("Can't open '" + strInputBBoxes + "'");
  }
 
  // For each bounding box,
  int frame, x, y, w, h;
  while (in >> frame >> x >> y >> w >> h)
  {
  // NOTE(review): the limit enforced here is 64 while the vectors
  // reserve 128 and the output arrays are created with capacity 128.
  // Confirm which limit is intended.
  if (bboxes.size() == 64)
  {
  throw std::runtime_error("Too many bounding boxes");
  }
 
  // Convert the axis-aligned bounding box into our tracking
  // structure.
 
  VPIKLTTrackedBoundingBox track = {};
  // scale
  track.bbox.xform.mat3[0][0] = 1;
  track.bbox.xform.mat3[1][1] = 1;
  // position
  track.bbox.xform.mat3[0][2] = x;
  track.bbox.xform.mat3[1][2] = y;
  // must be 1
  track.bbox.xform.mat3[2][2] = 1;
 
  track.bbox.width = w;
  track.bbox.height = h;
  track.trackingStatus = 0; // valid tracking
  track.templateStatus = 1; // must update
 
  bboxes.push_back(track);
 
  // Identity predicted transform.
  VPIHomographyTransform2D xform = {};
  xform.mat3[0][0] = 1;
  xform.mat3[1][1] = 1;
  xform.mat3[2][2] = 1;
  preds.push_back(xform);
 
  // Cumulative count: number of boxes known up to and including this frame.
  bboxes_size_at_frame[frame] = bboxes.size();
  }
 
  if (!in && !in.eof())
  {
  throw std::runtime_error("Can't parse bounding boxes, stopped at bbox #" +
  std::to_string(bboxes.size()));
  }
 
  // Wrap the input arrays into VPIArray's
  // The wrappers start with size 0 (bboxesSize / predsSize); the main loop
  // grows them with vpiArraySetSize as boxes become active.
  // NOTE(review): &bboxes[0] / &preds[0] are taken even if the file held no
  // bounding boxes (empty vectors) - confirm that case cannot occur.
  VPIArrayData data = {};
  data.bufferType = VPI_ARRAY_BUFFER_HOST_AOS;
  data.buffer.aos.type = VPI_ARRAY_TYPE_KLT_TRACKED_BOUNDING_BOX;
  data.buffer.aos.capacity = bboxes.capacity();
  data.buffer.aos.sizePointer = &bboxesSize;
  data.buffer.aos.data = &bboxes[0];
  CHECK_STATUS(vpiArrayCreateWrapper(&data, 0, &inputBoxList));
 
  data.buffer.aos.type = VPI_ARRAY_TYPE_HOMOGRAPHY_TRANSFORM_2D;
  data.buffer.aos.sizePointer = &predsSize;
  data.buffer.aos.data = &preds[0];
  CHECK_STATUS(vpiArrayCreateWrapper(&data, 0, &inputPredList));
  }
 
  // Now parse the backend
  VPIBackend backend;
 
  if (strBackend == "cpu")
  {
  backend = VPI_BACKEND_CPU;
  }
  else if (strBackend == "cuda")
  {
  backend = VPI_BACKEND_CUDA;
  }
  else if (strBackend == "pva")
  {
  backend = VPI_BACKEND_PVA;
  }
  else
  {
  throw std::runtime_error("Backend '" + strBackend +
  "' not recognized, it must be either cpu, cuda or pva.");
  }
 
  // Create the stream for the given backend.
  CHECK_STATUS(vpiStreamCreate(backend, &stream));
 
  // Helper function to fetch a frame from input
  // Returns an empty cv::Mat once no more frames can be read; nextFrame
  // counts the frames successfully fetched so far.
  int nextFrame = 0;
  auto fetchFrame = [&invid, &nextFrame, backend]() {
  cv::Mat frame;
  if (!invid.read(frame))
  {
  return cv::Mat();
  }
 
  // We only support grayscale inputs
  if (frame.channels() == 3)
  {
  cvtColor(frame, frame, cv::COLOR_BGR2GRAY);
  }
 
  if (backend == VPI_BACKEND_PVA)
  {
  // PVA only supports 16-bit unsigned inputs,
  // where each element is in 0-255 range, so
  // no rescaling needed.
  cv::Mat aux;
  frame.convertTo(aux, CV_16U);
  frame = aux;
  }
  else
  {
  assert(frame.type() == CV_8U);
  }
 
  ++nextFrame;
  return frame;
  };
 
  // Fetch the first frame and wrap it into a VPIImage.
  // Templates will be based on this frame.
  // NOTE(review): fetchFrame() returns an empty cv::Mat when the read fails;
  // that is not checked here before the frame is wrapped.
  cvTemplate = fetchFrame();
  CHECK_STATUS(vpiImageCreateWrapperOpenCVMat(cvTemplate, 0, &imgTemplate));
 
  // Create the reference image wrapper. For now we wrap cvTemplate,
  // only to create the wrapper. Later we'll set it to wrap the actual reference image.
  CHECK_STATUS(vpiImageCreateWrapperOpenCVMat(cvTemplate, 0, &imgReference));
 
  VPIImageFormat imgFormat;
  CHECK_STATUS(vpiImageGetFormat(imgTemplate, &imgFormat));
 
  // Using the first frame's characteristics, create a KLT bounding box tracker payload.
  // We're limiting the template dimensions to 64x64.
  // NOTE(review): no explicit limit is passed here (creation params are NULL);
  // presumably 64x64 is the library default - confirm against the VPI docs.
  CHECK_STATUS(vpiCreateKLTFeatureTracker(backend, cvTemplate.cols, cvTemplate.rows, imgFormat, NULL, &klt));
 
  // Parameters we'll use. No need to change them on the fly, so just define them here.
  VPIKLTFeatureTrackerParams params;
  CHECK_STATUS(vpiInitKLTFeatureTrackerParams(&params));
 
  // Output array with estimated bounding boxes for the current frame.
  CHECK_STATUS(vpiArrayCreate(128, VPI_ARRAY_TYPE_KLT_TRACKED_BOUNDING_BOX, 0, &outputBoxList));
 
  // Output array with estimated transforms of the input bounding boxes to match the output bounding boxes.
  CHECK_STATUS(vpiArrayCreate(128, VPI_ARRAY_TYPE_HOMOGRAPHY_TRANSFORM_2D, 0, &outputEstimList));
 
  size_t curNumBoxes = 0;
 
  do
  {
  // Index of the frame currently held as the template.
  size_t curFrame = nextFrame - 1;
 
  // Get the number of bounding boxes in the current frame.
  // upper_bound() gives the first entry past curFrame; stepping back one
  // yields the latest entry at or before curFrame.
  // NOTE(review): if the map is empty or its first key is greater than
  // curFrame, upper_bound() returns begin() and decrementing it is
  // undefined - confirm the bbox file always has a box at frame 0.
  auto tmp = --bboxes_size_at_frame.upper_bound(curFrame);
  size_t bbox_count = tmp->second;
 
  assert(bbox_count >= curNumBoxes && "输入边界框必须按帧排序");
 
  // Does the current frame have new bounding boxes?
  if (curNumBoxes != bbox_count)
  {
  // Update the input array sizes; the new boxes are already there since we
  // pre-populated these arrays with all input bounding boxes.
  CHECK_STATUS(vpiArraySetSize(inputBoxList, bbox_count));
  CHECK_STATUS(vpiArraySetSize(inputPredList, bbox_count));
 
  for (size_t i = 0; i < bbox_count - curNumBoxes; ++i)
  {
  std::cout << curFrame << " -> new " << curNumBoxes + i << std::endl;
  }
  assert(bbox_count <= bboxes.capacity());
  assert(bbox_count <= preds.capacity());
 
  curNumBoxes = bbox_count;
  }
 
  // Write this frame, with the boxes drawn on it, to the output video.
  outVideo << WriteKLTBoxes(imgTemplate, inputBoxList, inputPredList);
 
  // Fetch a new frame
  cvReference = fetchFrame();
 
  // Has the video ended?
  if (cvReference.data == NULL)
  {
  // Exit gracefully.
  break;
  }
 
  // Make the reference wrapper point to the reference frame
  CHECK_STATUS(vpiImageSetWrappedOpenCVMat(imgReference, cvReference));
 
  // Estimate the bounding boxes in the current frame (reference) given their position in the
  // previous frame (template).
  CHECK_STATUS(vpiSubmitKLTFeatureTracker(stream, backend, klt, imgTemplate, inputBoxList, inputPredList,
  imgReference, outputBoxList, outputEstimList, &params));
 
  // Wait for processing to finish.
  CHECK_STATUS(vpiStreamSync(stream));
 
  // Now we lock the input and output arrays to properly set up the input for the next iteration.
  // The input arrays will be updated based on the tracking information produced in this iteration.
  VPIArrayData updatedBBoxData;
  CHECK_STATUS(vpiArrayLockData(outputBoxList, VPI_LOCK_READ, VPI_ARRAY_BUFFER_HOST_AOS, &updatedBBoxData));
 
  VPIArrayData estimData;
  CHECK_STATUS(vpiArrayLockData(outputEstimList, VPI_LOCK_READ, VPI_ARRAY_BUFFER_HOST_AOS, &estimData));
 
  // Since these arrays are actually wrappers of external data, we don't need to retrieve
  // the VPI array contents: the wrapped buffers (bboxes / preds) are updated directly.
  // The arrays must be locked for read/write anyway.
  CHECK_STATUS(vpiArrayLock(inputBoxList, VPI_LOCK_READ_WRITE));
  CHECK_STATUS(vpiArrayLock(inputPredList, VPI_LOCK_READ_WRITE));
 
  auto *updated_bbox = reinterpret_cast<VPIKLTTrackedBoundingBox *>(updatedBBoxData.buffer.aos.data);
  auto *estim = reinterpret_cast<VPIHomographyTransform2D *>(estimData.buffer.aos.data);
 
  // For each bounding box,
  for (size_t b = 0; b < curNumBoxes; ++b)
  {
  // Did tracking fail?
  if (updated_bbox[b].trackingStatus)
  {
  // Do we also have to update the input bounding box's tracking status?
  if (bboxes[b].trackingStatus == 0)
  {
  std::cout << curFrame << " -> dropped " << b << std::endl;
  bboxes[b].trackingStatus = 1;
  }
 
  continue;
  }
 
  // Must the template of this bounding box be updated?
  if (updated_bbox[b].templateStatus)
  {
  std::cout << curFrame << " -> update " << b << std::endl;
 
  // There are usually two approaches here:
  // 1. Redefine the bounding box using a feature detector such as
  // \ref algo_harris_corners "Harris corner detector", or
  // 2. Use updated_bbox[b], which is still valid, although tracking
  // errors might accumulate over time.
  //
  // We go with the second option: less robust, but simple enough
  // to implement.
  bboxes[b] = updated_bbox[b];
 
  // Signal the input that the template for this bounding box must be updated.
  bboxes[b].templateStatus = 1;
 
  // The predicted transform is now the identity, since tracking was reset.
  preds[b] = VPIHomographyTransform2D{};
  preds[b].mat3[0][0] = 1;
  preds[b].mat3[1][1] = 1;
  preds[b].mat3[2][2] = 1;
  }
  else
  {
  // Tell the input that the template for this bounding box doesn't need to be updated.
  bboxes[b].templateStatus = 0;
 
  // We just update the input transform with the estimated one.
  preds[b] = estim[b];
  }
  }
 
  // We're done working with the input and output arrays.
  CHECK_STATUS(vpiArrayUnlock(inputBoxList));
  CHECK_STATUS(vpiArrayUnlock(inputPredList));
 
  CHECK_STATUS(vpiArrayUnlock(outputBoxList));
  CHECK_STATUS(vpiArrayUnlock(outputEstimList));
 
  // Swap roles: the current reference frame becomes the template of the next iteration.
  std::swap(imgTemplate, imgReference);
  std::swap(cvTemplate, cvReference);
  } while (true);
  }
  catch (std::exception &e)
  {
  std::cerr << e.what() << std::endl;
  retval = 1;
  }
 
  // Clean up on both the success and the error path; handles that were
  // never created are still NULL at this point.
  vpiStreamDestroy(stream);
  vpiPayloadDestroy(klt);
  vpiArrayDestroy(inputBoxList);
  vpiArrayDestroy(inputPredList);
  vpiArrayDestroy(outputBoxList);
  vpiArrayDestroy(outputEstimList);
  vpiImageDestroy(imgReference);
  vpiImageDestroy(imgTemplate);
 
  return retval;
 }

VPI - Vision Programming Interface

3.2 Release

概述

说明

结果

源代码