I'm trying to run inference with a YOLOv8 model trained for object detection on the COCO dataset. There is basically no official documentation for this, but I tried to improvise from the little example code that is provided. However, almost everything about the output is wrong, from the classes to the number of bounding boxes and their positions, and I don't know why. I seem to be following every step correctly, so I don't understand why it fails so badly.
I'm referring to this code provided by Ultralytics.
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include <iostream>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/opencv.hpp>
#include <random>
#define NUM_CLASSES_COCO 80
// Convert to letterbox format (copied from the official implementation)
cv::Mat formatToSquare(const cv::Mat &source)
{
    int col = source.cols;
    int row = source.rows;
    int _max = MAX(col, row);
    cv::Mat result = cv::Mat::zeros(_max, _max, CV_8UC3);
    source.copyTo(result(cv::Rect(0, 0, col, row)));
    return result;
}
// Preprocessing function (adapted from the official implementation)
cv::Mat preprocess(cv::Mat &inputImage, int model_input_height, int model_input_width) {
    cv::Mat modelInputImage = formatToSquare(inputImage);
    cv::Mat blob;
    // model height and width are the same here, so the order doesn't matter in this case
    cv::dnn::blobFromImage(modelInputImage, blob, 1.0 / 255.0, cv::Size(model_input_width, model_input_height), cv::Scalar(), true, false);
    return blob;
}
int main()
{
    cv::Mat inputImage; // assume this is loaded correctly
    float modelConfidenceThreshold = 0.25;
    float modelScoreThreshold = 0.45;
    float modelNMSThreshold = 0.5;
    // path to a yolov8n tflite file trained on the COCO dataset
    const std::string model_path = "...";
    std::unique_ptr<tflite::FlatBufferModel> model = tflite::FlatBufferModel::BuildFromFile(model_path.c_str());
    tflite::ops::builtin::BuiltinOpResolver resolver;
    std::unique_ptr<tflite::Interpreter> interpreter;
    tflite::InterpreterBuilder(*model, resolver)(&interpreter);
    // Allocate tensors
    if (interpreter->AllocateTensors() != kTfLiteOk)
    {
        std::cerr << "Failed to allocate tensors." << std::endl;
        return -1;
    }
    int input = interpreter->inputs()[0];
    // the exported model reports its input shape as (1, 640, 640, 3), i.e. NHWC
    TfLiteIntArray *dims = interpreter->tensor(input)->dims;
    const int model_input_height = dims->data[1];
    const int model_input_width = dims->data[2];
    const int model_input_channels = dims->data[3];
    cv::Mat preprocessed_image_blob = preprocess(inputImage, model_input_height, model_input_width);
    float *input_data = interpreter->typed_tensor<float>(input);
    std::memcpy(input_data, preprocessed_image_blob.data, sizeof(float) * model_input_width * model_input_height * model_input_channels);
    // Run inference
    if (interpreter->Invoke() != kTfLiteOk)
    {
        std::cerr << "Failed to invoke tflite interpreter." << std::endl;
        return -1;
    }
    TfLiteTensor *output_tensor = interpreter->tensor(interpreter->outputs()[0]); // dims are (1, 84, 8400) as expected
    int rows = output_tensor->dims->data[2];       // 8400 candidate detections
    int dimensions = output_tensor->dims->data[1]; // 84 = 4 box values + 80 class scores
    float *data = output_tensor->data.f;
    // This is the post-processing part, which is rather tricky.
    // Following the official implementation: reshape the initial
    // tensor from (1, 84, 8400) to (8400, 84) and then proceed to
    // flatten it.
    cv::Mat temp(dimensions, rows, CV_32F, data); // need this to transpose to shape (8400, 84)
    cv::transpose(temp, temp);
    float *new_data = (float *)temp.data;
    // map model coordinates back to the letterboxed square image;
    // the float casts matter, integer division would truncate the factors
    int squareSize = MAX(inputImage.cols, inputImage.rows);
    float x_factor = static_cast<float>(squareSize) / model_input_width;
    float y_factor = static_cast<float>(squareSize) / model_input_height;
    std::vector<int> class_ids;
    std::vector<float> confidences;
    std::vector<cv::Rect> boxes;
    for (int i = 0; i < rows; i++) {
        float *classes_scores = new_data + 4; // skip the 4 box values
        cv::Mat scores(1, NUM_CLASSES_COCO, CV_32FC1, classes_scores);
        cv::Point class_id;
        double maxClassScore;
        cv::minMaxLoc(scores, 0, &maxClassScore, 0, &class_id);
        if (maxClassScore > modelScoreThreshold)
        {
            confidences.push_back(static_cast<float>(maxClassScore));
            class_ids.push_back(class_id.x);
            float x = new_data[0];
            float y = new_data[1];
            float w = new_data[2];
            float h = new_data[3];
            int left = int((x - 0.5 * w) * x_factor);
            int top = int((y - 0.5 * h) * y_factor);
            int width = int(w * x_factor);
            int height = int(h * y_factor);
            boxes.push_back(cv::Rect(left, top, width, height));
        }
        new_data += dimensions;
    }
    // Perform NMS over the bounding boxes
    std::vector<int> nms_result;
    cv::dnn::NMSBoxes(boxes, confidences, modelScoreThreshold, modelNMSThreshold, nms_result);
    std::cout << "after nms: " << nms_result.size() << std::endl;
    std::vector<Detection> detections{};
    for (unsigned long i = 0; i < nms_result.size(); ++i)
    {
        int idx = nms_result[i];
        Detection result;
        result.class_id = class_ids[idx];
        result.confidence = confidences[idx];
        // random color per detection, for drawing
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_int_distribution<int> dis(100, 255);
        result.color = cv::Scalar(dis(gen), dis(gen), dis(gen));
        result.box = boxes[idx];
        detections.push_back(result);
    }
    return 0;
}
So when I inspect the values in the detections vector, both my class predictions and the length of the vector are wrong (its size is always 1, no matter which image I use). The pre- and post-processing steps are copied from their official documentation (I only had to adapt them for TFLite), so I'm not sure anything is wrong there. The tflite file has also been confirmed to export correctly via the Python API. I don't even know how to start debugging this. Any hints would be greatly appreciated.
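For anyone attempting the same, the first sanity check I can think of is to dump every input/output tensor's shape and type and diff them against what the Python API reports. This is only a minimal sketch against the TFLite C++ API, assuming the `interpreter` set up above; `dumpTensorInfo` is just an illustrative helper name:

// Hypothetical helper: print the shape and type of every input/output
// tensor so they can be diffed against the Python export.
void dumpTensorInfo(const tflite::Interpreter &interpreter)
{
    auto print = [&](const char *kind, int idx) {
        const TfLiteTensor *t = interpreter.tensor(idx);
        std::cout << kind << " " << idx << " '" << (t->name ? t->name : "?") << "': [";
        for (int d = 0; d < t->dims->size; ++d)
            std::cout << (d ? ", " : "") << t->dims->data[d];
        std::cout << "] type=" << t->type << std::endl;
    };
    for (int idx : interpreter.inputs())
        print("input", idx);
    for (int idx : interpreter.outputs())
        print("output", idx);
}

Calling `dumpTensorInfo(*interpreter)` right after `AllocateTensors()` would make a layout mismatch visible immediately.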
Edit: After a lot of debugging, I narrowed the problem down to the decoding of the output shape. With that fixed, the class predictions got better (though still not great).
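For reference, the fixed decoding reads the layout straight off the output tensor instead of hard-coding it (this is what the corresponding lines in the code above now do):

// Read the output layout from the tensor itself. For a COCO yolov8n
// export the dims come back as (1, 84, 8400):
// 84 = 4 box values (x, y, w, h) + 80 class scores, 8400 candidates.
TfLiteTensor *out = interpreter->tensor(interpreter->outputs()[0]);
int dimensions = out->dims->data[1]; // 84
int rows = out->dims->data[2];       // 8400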
The problem now is that NMS isn't working. I've pinned down the reason: the x, y, w, h values are, for some reason, in the (0, 1) range when they should be in the (0, image_dimension) range. I can't understand how the model can output values it was never trained to produce.
I can say with certainty that my preprocessing step is correct (cross-validated against a working Python implementation). My output tensor shape is also correct, and my reshaping step is accurate. Only the bounding box coordinate values coming out of the model are wrong; the class prediction scores are more or less fine.
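In case the export really does emit normalized coordinates, scaling them back would look something like this (a sketch only; `squareSize` is the letterboxed side length computed above, and the assumption that the outputs are normalized is exactly the part I'm unsure about):

// Assuming the model emits xywh normalized to (0, 1): scale by the
// letterboxed square side instead of the (square / model) ratios.
float x = new_data[0] * squareSize;
float y = new_data[1] * squareSize;
float w = new_data[2] * squareSize;
float h = new_data[3] * squareSize;
int left = static_cast<int>(x - 0.5f * w);
int top = static_cast<int>(y - 0.5f * h);
boxes.push_back(cv::Rect(left, top, static_cast<int>(w), static_cast<int>(h)));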
The input needs to be reshaped from [1, 3, 640, 640] to [1, 640, 640, 3].
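In other words, `cv::dnn::blobFromImage` produces an NCHW blob while the TFLite export expects NHWC. One way around it (a sketch, not the only option) is to skip `blobFromImage` entirely and build the NHWC buffer directly, since `cv::Mat` already stores pixels interleaved (HWC):

// Build the NHWC float input directly: letterbox, resize, BGR->RGB,
// scale to [0, 1], then copy the interleaved (HWC) pixels as-is.
cv::Mat square = formatToSquare(inputImage); // BGR, uint8
cv::Mat resized, rgb, floats;
cv::resize(square, resized, cv::Size(model_input_width, model_input_height));
cv::cvtColor(resized, rgb, cv::COLOR_BGR2RGB); // the export expects RGB
rgb.convertTo(floats, CV_32FC3, 1.0 / 255.0);
std::memcpy(input_data, floats.data,
            sizeof(float) * model_input_height * model_input_width * 3);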