OpenCV之GOTURN目标追踪

💂 个人主页:风间琉璃
🤟 版权: 本文由【风间琉璃】原创、在CSDN首发、需要转载请联系博主
💬 如果文章对你有帮助、欢迎关注、点赞、收藏(一键三连)和订阅专栏哦

前言

GOTURN（Generic Object Tracking Using Regression Networks）是一种用于目标跟踪的计算机视觉算法，它使用回归神经网络来实现实时目标跟踪。GOTURN的目标是通过检测并跟踪特定物体，使其能够在视频序列中保持物体的连续性，即使物体发生尺寸变化、遮挡或平移。该算法在提出时达到了Tracking中效果上的state-of-the-art，尤其在跟踪速度上达到了100FPS（第一个达到100FPS的深度学习方法）。

一、goturn简介

goturn整个算法的框架其实非常简单：输入当前帧和前一帧进入网络，输出当前帧bounding-box的位置。

以前一帧的目标区域为中心扩展，并crop出来。也就是说：在第t−1帧，Tracker预测的bounding-box中心位置为c=(cx，cy)，宽和高分别为w、h，crop出来的框大小为k1*w，k1*h，k1决定接受多少背景信息。

对于当前帧，也就是第t帧，基于上一帧的位置，找到待搜寻目标的区域，即search region，网络的目的就是要回归目标在当前search region中的location。这里设置search region的中心坐标为c'=(cx'，cy')=c，和前一帧框出来的区域是一样的，search region的大小为k2*w，k2*h，其中w、h是第t−1帧bounding-box的宽和高。这里设置k1=k2=2；对于快速移动的目标，k1、k2就需要增大了。

网络结构：

在当前帧和前一帧分别crop出region之后，送入网络进行feature extraction（CaffeNet的卷积层），将这些feature级联并输入fully-connected层，fc层的目的是为了比较object的特征和当前帧的特征，以找到object被移动到了哪里。fc层学习到的是一个复杂的特征比较函数，输出目标的相对运动。

随后fc层的输出被连接到一个4节点的层（分别代表bounding-box两个角的坐标），以输出object的位置。

二、预处理

图像预处理：对目标区域和搜索区域进行了预处理，包括调整大小和减去均值。这里因为是对视频流操作，因此需要对每一帧图像都进行预处理。


// Preprocessing: resize both patches to the network input size.
resize(targetPatch, targetPatch, Size(width, height));
resize(searchPatch, searchPatch, Size(width, height));
 
// Convert to float blobs and subtract the mean from every channel in one step.
// Writing `patch - meanval` on the 8-bit image instead would turn the int into
// Scalar(meanval, 0, 0, 0), i.e. change channel 0 only, and would saturate at 0,
// discarding every pixel value darker than the mean.
Mat targetBlob = blobFromImage(targetPatch, 1.0, Size(), Scalar::all(meanval), false, false);
Mat searchBlob = blobFromImage(searchPatch, 1.0, Size(), Scalar::all(meanval), false, false);

三、模型加载

数据预处理之后，就可以加载模型进行预测了。


// Load the network model: goturn_config is passed as the Caffe network description,
// goturn_model as the trained weights.
net = readNetFromCaffe(goturn_config, goturn_model);

四、执行推理

这里使用了cuda进行加速处理；如果没有cuda，把下面setPreferableBackend和setPreferableTarget这两行注释掉即可。


// Bind the two blobs to the network's named inputs: the target patch (previous frame)
// goes to "data1", the search patch (current frame) to "data2".
net.setInput(targetBlob, "data1");
net.setInput(searchBlob, "data2");
 
// Use CUDA acceleration (comment these two lines out when CUDA is not available).
net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA_FP16);
 
// Run the network up to the layer named "scale" and flatten its output into a
// single-channel, single-row matrix so the values can be read by linear index.
Mat res = net.forward("scale");
Mat resMat = res.reshape(1, 1);

五、解析输出

网络预测结果存储在res中，我们需要对其进行分析获取我们想要的数据，我们得到的是bounding-box两个对角的坐标。


curBB.x = targetPatchRect.x + (resMat.at<float>(0) * targetPatchRect.width / width) - targetPatchRect.width;
curBB.y = targetPatchRect.y + (resMat.at<float>(1) * targetPatchRect.height / height) - targetPatchRect.height;
curBB.width = (resMat.at<float>(2) - resMat.at<float>(0)) * targetPatchRect.width / width;
curBB.height = (resMat.at<float>(3) - resMat.at<float>(1)) * targetPatchRect.height / height;
 
if (curBB.width > 300)
	curBB.width = curBB.width / 4;
if (curBB.height > 300)
	curBB.height = curBB.height / 4;
 
//Predicted BB
Rect boundingBox = curBB;

最后的两个判断是一个经验性的补救措施：在使用cuda进行加速时，不知道为什么预测框会越来越大、帧率越来越低，因此只能强制限制预测框的范围（宽或高超过300时缩小为原来的1/4），这样追踪效果改善了不少。如果觉得多余也可以注释掉。

运行结果：在视频开始前需要用鼠标在视频中确定追踪的目标，然后按下空格或者enter进行追踪。

1694681842016

源码：下载：OpenCVGoturn目标追踪资源-CSDN文库


// Goturn_Object_Track.cpp : 此文件包含 "main" 函数。程序执行将在此处开始并结束。
//
 
#include <iostream>
#include <string>

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <opencv2/highgui/highgui_c.h>
 
 
using namespace cv;
using namespace cv::dnn;
using namespace std;
 
// Absolute paths of the GOTURN Caffe weights (.caffemodel) and network description
// (.prototxt); adjust them to the local location of the model files.
String  goturn_model = "F:/data/CQU/VS/Goturn_Object_Track/goturn.caffemodel";
String goturn_config = "F:/data/CQU/VS/Goturn_Object_Track/goturn.prototxt";
 
 
// The network; assigned in main() and used by trackObjects().
Net net;
// Current frame and previous frame.
Mat frame, prevFrame;
// Bounding box of the tracked object in the previous frame.
Rect prevBB;
 
 
/**
 * Predicts the tracked object's bounding box in the current frame with the GOTURN network.
 *
 * A window padTargetPatch times the size of the previous box (global prevBB), centred on that
 * box, is cropped from both the previous and the current frame. Both crops are resized to the
 * network input size, mean-subtracted and fed to the global net; the regressed corner
 * coordinates are then mapped back into frame coordinates.
 *
 * @param frame      current frame; only read
 * @param prevFrame  previous frame; only read
 * @return the predicted box in frame coordinates, or prevBB unchanged when no prediction can be
 *         made (empty frame, empty previous box, or a crop window outside the padded frame)
 */
Rect trackObjects(Mat& frame, Mat& prevFrame)
{
	const int width = 227;    // network input width
	const int height = 227;   // network input height
	const int meanval = 128;  // mean value subtracted from every channel

	// The crop window is padTargetPatch times the previous box in each dimension.
	const float padTargetPatch = 2.0;

	// Nothing to crop from or nothing to track: keep the previous box instead of
	// letting copyMakeBorder / resize fail on empty input.
	if (frame.empty() || prevFrame.empty() || prevBB.width <= 0 || prevBB.height <= 0)
		return prevBB;

	// Centre of the previous bounding box.
	Point2f prevCenter;
	prevCenter.x = (float)(prevBB.x + prevBB.width / 2);
	prevCenter.y = (float)(prevBB.y + prevBB.height / 2);

	// Crop window expressed in the coordinates of the padded frames: the frames are
	// padded by one full window size on every side, hence the "+ width" / "+ height".
	Rect2f targetPatchRect;
	targetPatchRect.width = (float)(prevBB.width * padTargetPatch);
	targetPatchRect.height = (float)(prevBB.height * padTargetPatch);
	targetPatchRect.x = (float)(prevCenter.x - prevBB.width * padTargetPatch / 2.0 + targetPatchRect.width);
	targetPatchRect.y = (float)(prevCenter.y - prevBB.height * padTargetPatch / 2.0 + targetPatchRect.height);

	const int padX = (int)targetPatchRect.width;
	const int padY = (int)targetPatchRect.height;

	// Replicate the borders so that a window overlapping the image edge can still be cropped.
	Mat prevFramePadded, curFramePadded;
	copyMakeBorder(prevFrame, prevFramePadded, padY, padY, padX, padX, BORDER_REPLICATE);
	copyMakeBorder(frame, curFramePadded, padY, padY, padX, padX, BORDER_REPLICATE);

	// A box that drifted far outside the image puts the window outside even the padded
	// frame; cropping would then throw, so keep the previous box instead.
	const Rect cropRect = targetPatchRect;
	const Rect prevBounds(0, 0, prevFramePadded.cols, prevFramePadded.rows);
	const Rect curBounds(0, 0, curFramePadded.cols, curFramePadded.rows);
	if (cropRect.area() <= 0 || (cropRect & prevBounds) != cropRect || (cropRect & curBounds) != cropRect)
		return prevBB;

	// The same window is used for the target (previous frame) and the search region (current frame).
	Mat targetPatch = prevFramePadded(cropRect).clone();
	Mat searchPatch = curFramePadded(cropRect).clone();

	// Preprocessing: resize to the network input size.
	resize(targetPatch, targetPatch, Size(width, height));
	resize(searchPatch, searchPatch, Size(width, height));

	// Convert to float blobs and subtract the mean from every channel in one step.
	// Writing `patch - meanval` on the 8-bit image instead would turn the int into
	// Scalar(meanval, 0, 0, 0), i.e. change channel 0 only, and would saturate at 0,
	// discarding every pixel value darker than the mean.
	Mat targetBlob = blobFromImage(targetPatch, 1.0, Size(), Scalar::all(meanval), false, false);
	Mat searchBlob = blobFromImage(searchPatch, 1.0, Size(), Scalar::all(meanval), false, false);

	net.setInput(targetBlob, "data1");
	net.setInput(searchBlob, "data2");

	// Use CUDA acceleration (comment these two lines out when CUDA is not available).
	net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
	net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA_FP16);

	// Flatten the output of the "scale" layer so its values can be read by linear index.
	Mat res = net.forward("scale");
	Mat resMat = res.reshape(1, 1);

	//printf("width : %f, height : %f\n", resMat.at<float>(2) - resMat.at<float>(0), resMat.at<float>(3) - resMat.at<float>(1));

	// The four outputs are read as two opposite corners. Rescale them from network-input
	// units to the crop window size and remove the border offset added above.
	Rect2d curBB;
	curBB.x = targetPatchRect.x + (resMat.at<float>(0) * targetPatchRect.width / width) - targetPatchRect.width;
	curBB.y = targetPatchRect.y + (resMat.at<float>(1) * targetPatchRect.height / height) - targetPatchRect.height;
	curBB.width = (resMat.at<float>(2) - resMat.at<float>(0)) * targetPatchRect.width / width;
	curBB.height = (resMat.at<float>(3) - resMat.at<float>(1)) * targetPatchRect.height / height;

	// Empirical limit: shrink a side to a quarter once it exceeds 300.
	if (curBB.width > 300)
		curBB.width = curBB.width / 4;
	if (curBB.height > 300)
		curBB.height = curBB.height / 4;

	// Predicted bounding box, converted to integer coordinates.
	Rect boundingBox = curBB;
	return boundingBox;
}
 
 
int main()
{
	//打开视频
	VideoCapture capture("cap.mp4");
	capture.read(frame);
	//加载网络模型
	net = readNetFromCaffe(goturn_config, goturn_model);
	
 
	frame.copyTo(prevFrame);
	//选取感兴趣区域
	prevBB = selectROI(frame, true, true);
	namedWindow("frame", CV_WINDOW_AUTOSIZE);
	//读取视频每一帧,并预测
	while(capture.read(frame)) 
	{
	
		//获得当前系统的计时间周期数,求FPS
		double t = (double)getTickCount();
 
		//预测
		Rect currentBB = trackObjects(frame, prevFrame);
		rectangle(frame, currentBB, Scalar(0, 255, 0), 2, 8, 0);
 
		// 准备下一帧
		frame.copyTo(prevFrame);
		prevBB.x = currentBB.x;
		prevBB.y = currentBB.y;
		prevBB.width = currentBB.width;
		prevBB.height = currentBB.height;
 
		//FPS计算
		t = ((double)getTickCount() - t) / getTickFrequency();//求输入帧后经过的周期数/每秒系统计的周期数=一帧用时多少秒
		int fps = 1.0 / t;//求倒数得到每秒经过多少帧，即帧率
		string text = "FPS:" + to_string(fps);
		cv::putText(frame, text, Point(10, 50), FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0), 2, 8, 0);
 
 
		imshow("frame", frame);
		char c = waitKey(5);
		if(c == 27) 
		{
			break;
		}
	}	
	return 0;
}

结束语
感谢你观看我的文章呐~本次航班到这里就结束啦 🛬

希望本篇文章有对你带来帮助 🎉，有学习到一点知识~

躲起来的星星🍥也在努力发光，你也要努力加油（让我们一起努力叭）。

最后，博主要一下你们的三连呀（点赞、评论、收藏），不要钱的还是可以搞一搞的嘛~

不知道评论啥的，即使扣个666也是对博主的鼓舞吖 💞 感谢 💐

相关阅读:
理论第七课——sort
地址解析协议ARP
乔布斯时代的“老人”，一个个都离开苹果了
 6.1、Flink数据写入到文件
 基于AI智能分析网关的智慧视频监控系统一站式解决方案
 开发实战经验分享：互联网医院系统源码与在线问诊APP搭建
 【Paddle】稀疏计算的使用指南 & 稀疏ResNet的学习心得 (2) + Paddle3D应用实例稀疏 ResNet代码解读（1.6w字超详细）
kotlin修饰符const的含义
 用户运营，如何多方面拉新？
RabbitMQ
原文地址：https://blog.csdn.net/qq_53144843/article/details/132881730