I have a program that captures live video from a CSI camera connected to a Jetson Nano and estimates the pose of an ArUco marker. However, when I move the ArUco marker in the real world, the position my code returns only changes about 0.5-1 s later. I am new to all of this and I don't know where the bottleneck is.
My code is below. I measured the average time the main loop needs to analyze a frame, and it is about 50-60 ms, so I don't think the detection and pose estimation themselves are the problem.
I commented out cv::imshow("Image", image); (to see whether the delay came from there), but the delay changed little or not at all. Since the window is no longer shown, cv::waitKey(1) == 'q' will no longer trigger, so I added total_time >= 30 as an alternative exit condition. As I understand it, whatever time I set in cv::waitKey, the program waits that long before continuing. With that in mind, I changed the line to cv::waitKey(1000) == 'q', so that the program has to wait one second before analyzing the next frame (effectively setting the fps to 1), to see what the delay would be. Well, any movement in the real world took about 12 seconds to show up in the output (I uncommented cv::imshow("Image", image); to see which frames were being analyzed).
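As a side note, a minimal sketch to check that assumption about cv::waitKey (keeping in mind that it only actually waits when at least one HighGUI window exists) could look like this:
#include <opencv2/opencv.hpp>
#include <iostream>
int main()
{
    // waitKey only processes events / waits if at least one HighGUI window exists.
    cv::namedWindow("timing test");
    double t0 = static_cast<double>(cv::getTickCount());
    cv::waitKey(1000); // should block for roughly one second
    double waited = (static_cast<double>(cv::getTickCount()) - t0) / cv::getTickFrequency();
    std::cout << "waitKey blocked for " << waited << " s" << std::endl;
    return 0;
}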
I assumed that when the time in cv::waitKey runs out, the current frame would be captured, but it seems that either an old frame is returned for some reason, or the video capture itself has a large delay.
Can anyone help me figure out where the problem is and what possible workarounds there are? Thanks in advance.
#include <opencv2/opencv.hpp>
#include <opencv2/aruco.hpp>
#include <opencv2/objdetect/aruco_detector.hpp>
#include <opencv2/objdetect/aruco_dictionary.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <string>
std::string gstreamer_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method, int sensor_mode)
{
return "nvarguscamerasrc sensor_mode=" + std::to_string(sensor_mode) + " ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) + ", height=(int)" +
std::to_string(capture_height) + ", framerate=(fraction)" + std::to_string(framerate) +
"/1 ! nvvidconv flip-method=" + std::to_string(flip_method) + " ! video/x-raw, width=(int)" + std::to_string(display_width) + ", height=(int)" +
std::to_string(display_height) + ", format=(string)GRAY8 ! videoconvert ! video/x-raw, format=(string)GRAY8 ! appsink";
}
bool loadCameraCalibration(const std::string &filename, cv::Mat &cameraMatrix, cv::Mat &distCoeffs)
{
    // Use the filename that was passed in rather than a hardcoded path.
    cv::FileStorage fs(filename, cv::FileStorage::READ);
    if (!fs.isOpened())
    {
        return false;
    }
    fs["Camera_Matrix"] >> cameraMatrix;
    fs["Distorsion_Coefficients"] >> distCoeffs;
    fs.release();
    return true;
}
void detectCorners(const cv::Mat &image, const cv::aruco::ArucoDetector &detector, std::vector<int> &ids, std::vector<std::vector<cv::Point2f>> &corners)
{
    // The image is already grayscale (the pipeline outputs GRAY8).
    // Detect the markers in the grayscale image.
    detector.detectMarkers(image, corners, ids);
    // Refine the detected corners to sub-pixel accuracy.
    for (auto &corner : corners)
    {
        cv::cornerSubPix(image, corner, cv::Size(3, 3), cv::Size(-1, -1), cv::TermCriteria(cv::TermCriteria::EPS + cv::TermCriteria::COUNT, 30, 0.01));
    }
}
int main()
{
    // Set camera parameters
    int capture_width = 1640;
    int capture_height = 1232;
    int display_width = 1640;
    int display_height = 1232;
    int framerate = 30;
    int flip_method = 2;
    int sensor_mode = 3;
    std::string pipeline = gstreamer_pipeline(capture_width, capture_height, display_width, display_height, framerate, flip_method, sensor_mode);
    std::cout << "Using pipeline: \n\t" << pipeline << "\n";
    // Load the camera matrix and distortion coefficients from the XML file.
    cv::Mat cameraMatrix, distCoeffs;
    if (!loadCameraCalibration("calibration_params.xml", cameraMatrix, distCoeffs))
    {
        std::cout << "Failed to load camera calibration parameters." << std::endl;
        return -1;
    }
    // Set the marker length.
    float markerLength = 0.10f;
    // Create a 4x1 coordinate system for the marker.
    cv::Mat objPoints(4, 1, CV_32FC3);
    objPoints.ptr<cv::Vec3f>(0)[0] = cv::Vec3f(-markerLength / 2.f, markerLength / 2.f, 0);
    objPoints.ptr<cv::Vec3f>(0)[1] = cv::Vec3f(markerLength / 2.f, markerLength / 2.f, 0);
    objPoints.ptr<cv::Vec3f>(0)[2] = cv::Vec3f(markerLength / 2.f, -markerLength / 2.f, 0);
    objPoints.ptr<cv::Vec3f>(0)[3] = cv::Vec3f(-markerLength / 2.f, -markerLength / 2.f, 0);
    // Create a detector and dictionary.
    cv::aruco::DetectorParameters detectorParams = cv::aruco::DetectorParameters();
    cv::aruco::Dictionary dictionary = cv::aruco::getPredefinedDictionary(cv::aruco::DICT_4X4_50);
    cv::aruco::ArucoDetector detector(dictionary, detectorParams);
    // Create a VideoCapture object to grab frames from the camera.
    cv::VideoCapture cap(pipeline, cv::CAP_GSTREAMER);
    if (!cap.isOpened())
    {
        std::cout << "Failed to open camera." << std::endl;
        return -1;
    }
    double total_time = 0;
    int count_loop = 0;
    // Main loop to grab frames and detect markers.
    while (true)
    {
        count_loop += 1;
        // Start timer
        double start_time = static_cast<double>(cv::getTickCount());
        cv::Mat image;
        if (!cap.read(image))
        {
            std::cout << "Capture read error" << std::endl;
            break;
        }
        // Detect the markers in the image.
        std::vector<int> ids;
        std::vector<std::vector<cv::Point2f>> corners;
        detectCorners(image, detector, ids, corners);
        // If at least one marker was detected, draw it and its pose.
        if (ids.size() > 0)
        {
            cv::aruco::drawDetectedMarkers(image, corners, ids);
            // Initialize rvecs and tvecs.
            std::vector<cv::Vec3d> rvecs(ids.size());
            std::vector<cv::Vec3d> tvecs(ids.size());
            // Calculate the pose of each marker.
            for (size_t i = 0; i < ids.size(); i++)
            {
                cv::solvePnP(objPoints, corners[i], cameraMatrix, distCoeffs, rvecs[i], tvecs[i]);
                // Note that solvePnP returns the marker coordinates relative to the camera.
            }
            for (size_t j = 0; j < ids.size(); j++)
            {
                // Check if the current marker has ID 13.
                if (ids[j] == 13)
                {
                    // Extract x, y, z from the translation vector (tvec).
                    double x = tvecs[j][0];
                    double y = tvecs[j][1];
                    double z = tvecs[j][2];
                    // Calculate yaw, pitch, and roll from the rotation vector (rvec).
                    cv::Mat rotMat;
                    cv::Rodrigues(rvecs[j], rotMat);
                    double yaw, pitch, roll;
                    // https://stackoverflow.com/questions/11514063/extract-yaw-pitch-and-roll-from-a-rotationmatrix
                    /*
                    Since OpenCV uses the pinhole camera model (z looking out of the camera,
                    x to the right, y down), yaw, pitch and roll aren't the same. Here, yaw will be the
                    rotation around the y axis, pitch the rotation around the x axis and roll the
                    rotation around the z axis.
                    Normally, we would calculate it like this:
                    // Calculate yaw (around z-axis).
                    yaw = atan2(rotMat.at<double>(1, 0), rotMat.at<double>(0, 0));
                    // Calculate pitch (around y-axis).
                    pitch = atan2(-rotMat.at<double>(2, 0), std::sqrt(rotMat.at<double>(2, 1) * rotMat.at<double>(2, 1) + rotMat.at<double>(2, 2) * rotMat.at<double>(2, 2)));
                    // Calculate roll (around x-axis).
                    roll = atan2(-rotMat.at<double>(2, 1), rotMat.at<double>(2, 2));
                    But we must change some of these to be coherent with the pinhole model:
                    roll becomes pitch, pitch becomes yaw and yaw becomes roll.
                    */
                    // Using the pinhole camera model:
                    // Calculate roll (around z-axis).
                    roll = atan2(rotMat.at<double>(1, 0), rotMat.at<double>(0, 0));
                    // Calculate yaw (around y-axis).
                    yaw = atan2(-rotMat.at<double>(2, 0), std::sqrt(rotMat.at<double>(2, 1) * rotMat.at<double>(2, 1) + rotMat.at<double>(2, 2) * rotMat.at<double>(2, 2)));
                    // Calculate pitch (around x-axis).
                    pitch = atan2(-rotMat.at<double>(2, 1), rotMat.at<double>(2, 2));
                    // Convert yaw, pitch, and roll to degrees.
                    double yaw_degrees = yaw * (180.0 / CV_PI);
                    double pitch_degrees = pitch * (180.0 / CV_PI);
                    double roll_degrees = roll * (180.0 / CV_PI);
                    // Print the measurements.
                    std::cout << "x: " << x << std::endl;
                    std::cout << "y: " << y << std::endl;
                    std::cout << "z: " << z << std::endl;
                    std::cout << "yaw: " << yaw_degrees << std::endl;
                    std::cout << "roll: " << roll_degrees << std::endl;
                    std::cout << "pitch: " << pitch_degrees << std::endl;
                }
            }
            // Draw the pose of each marker.
            // Since I am no longer showing the image, this part of the code is not needed, so I commented it out.
            /*
            for (size_t i = 0; i < ids.size(); i++)
            {
                cv::drawFrameAxes(image, cameraMatrix, distCoeffs, rvecs[i], tvecs[i], 0.1);
            }
            */
        }
        // Show the image with detected markers.
        // cv::imshow("Image", image);
        // Calculate elapsed time in seconds.
        double elapsed_time = (static_cast<double>(cv::getTickCount()) - start_time) / cv::getTickFrequency();
        std::cout << "Elapsed Time: " << elapsed_time << " seconds" << std::endl;
        total_time += elapsed_time;
        // Break the loop if the 'q' key is pressed or after 30 seconds of total processing time.
        if ((cv::waitKey(1) == 'q') || (total_time >= 30))
        {
            break;
        }
    }
    // Calculate the average time for each iteration of the loop.
    std::cout << "Average time= " << total_time / count_loop << std::endl;
    cap.release();
    cv::destroyAllWindows();
    return 0;
}
Edit: The problem does not seem to come from the ArUco logic itself. Without the ArUco logic the delay is slightly smaller (for example, with the time in cv::waitKey() set to 1 ms, the delay is around 0.5 s, maybe a little less). I can tell the problem is still there without the ArUco logic because, when I change the time in cv::waitKey() to 1000 ms, the delay is again roughly 12 seconds (if removing the ArUco logic made any difference in this case, I couldn't notice it). Here is the updated code:
#include <opencv2/opencv.hpp>
#include <iostream>
#include <string>
std::string gstreamer_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method, int sensor_mode)
{
return "nvarguscamerasrc sensor_mode=" + std::to_string(sensor_mode) + " ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) + ", height=(int)" +
std::to_string(capture_height) + ", framerate=(fraction)" + std::to_string(framerate) +
"/1 ! nvvidconv flip-method=" + std::to_string(flip_method) + " ! video/x-raw, width=(int)" + std::to_string(display_width) + ", height=(int)" +
std::to_string(display_height) + ", format=(string)GRAY8 ! videoconvert ! video/x-raw, format=(string)GRAY8 ! appsink";
}
int main()
{
    // Set camera parameters
    int capture_width = 1640;
    int capture_height = 1232;
    int framerate = 30;
    int flip_method = 2;
    int sensor_mode = 3;
    std::string pipeline = gstreamer_pipeline(capture_width, capture_height, capture_width, capture_height, framerate, flip_method, sensor_mode);
    std::cout << "Using pipeline: \n\t" << pipeline << "\n";
    // Create a VideoCapture object to grab frames from the camera.
    cv::VideoCapture cap(pipeline, cv::CAP_GSTREAMER);
    if (!cap.isOpened())
    {
        std::cout << "Failed to open camera." << std::endl;
        return -1;
    }
    cv::namedWindow("Image", cv::WINDOW_NORMAL); // Create a window for displaying the image
    // Main loop to grab frames, display them, and check for user input.
    while (true)
    {
        cv::Mat image;
        if (!cap.read(image))
        {
            std::cout << "Capture read error" << std::endl;
            break;
        }
        // Display the captured image
        cv::imshow("Image", image);
        // Check if the 'q' key is pressed using cv::waitKey
        if (cv::waitKey(1) == 'q')
        {
            break;
        }
    }
    cap.release();
    cv::destroyAllWindows();
    return 0;
}
Edit 2:
If I set the time in cv::waitKey to 2000 ms, it still takes the same 12 frames before the displayed image reflects the movement. Maybe there is a buffer somewhere that causes this delay?
I tried adding cap.set(cv::CAP_PROP_BUFFERSIZE, 1); right below cv::VideoCapture cap(pipeline, cv::CAP_GSTREAMER); but that did not change anything.
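Another generic workaround that might be worth trying (just a sketch, I have not verified it on the Nano) would be to replace the cap.read(image) call inside the while loop with a few cap.grab() calls, so that any frames queued in the capture pipeline are discarded and only the newest one is decoded:
// Sketch: drain frames that may be queued in the capture pipeline and
// only decode/process the most recent one.
cv::Mat image;
for (int i = 0; i < 5; ++i) // 5 is an arbitrary guess at the queue depth
{
    cap.grab(); // grab (and implicitly discard) a buffered frame
}
if (!cap.retrieve(image)) // decode only the last grabbed frame
{
    std::cout << "Capture read error" << std::endl;
}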
I followed the advice of @pptaszni (https://stackoverflow.com/users/4165552/pptaszni) and changed the pipeline so that it drops frames, and the latency has dropped to a perfectly acceptable level, which is exactly what I was hoping for. I don't have a reliable way to measure it, but if I had to guess it is probably around 200 ms, including all the ArUco logic. Here is the updated pipeline function:
std::string gstreamer_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method, int sensor_mode)
{
return "nvarguscamerasrc sensor_mode=" + std::to_string(sensor_mode) + " ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) + ", height=(int)" +
std::to_string(capture_height) + ", framerate=(fraction)" + std::to_string(framerate) +
"/1 ! nvvidconv flip-method=" + std::to_string(flip_method) + " ! video/x-raw, width=(int)" + std::to_string(display_width) + ", height=(int)" +
std::to_string(display_height) + ", format=(string)GRAY8 ! videoconvert ! video/x-raw, format=(string)GRAY8 ! appsink drop=true sync=false";
}
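For what it's worth, appsink also exposes a max-buffers property (I have not tested this on my setup), so the sink part of the pipeline could additionally cap the queue at a single frame:
// Hypothetical variant of the sink element, untested here:
// max-buffers=1 keeps at most one frame queued, drop=true discards old
// frames instead of blocking, and sync=false disables clock synchronization.
const std::string sink = "appsink max-buffers=1 drop=true sync=false";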
Thanks everyone!