I am trying to deploy the model from this link in a C++ crack-segmentation application. I followed the steps in this link to convert the PyTorch model and serialize it. Here is the link to the traced module, and here is the link to a sample image.
Here is the code from the original "inference_unet.py" file:
import sys
import os
import numpy as np
from pathlib import Path
import cv2 as cv
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.transforms as transforms
from unet.unet_transfer import UNet16, input_size
import matplotlib.pyplot as plt
import argparse
from os.path import join
from PIL import Image
import gc
from utils import load_unet_vgg16, load_unet_resnet_101, load_unet_resnet_34
from tqdm import tqdm
import torch #Hedeya
import torchvision #Hedeya
def evaluate_img(model, img):
    input_width, input_height = input_size[0], input_size[1]
    img_1 = cv.resize(img, (input_width, input_height), cv.INTER_AREA)
    print(img_1.shape)
    #X = train_tfms(Image.fromarray(img_1))
    X = train_tfms(img_1)
    print(X.shape)
    X = Variable(X.unsqueeze(0)).cuda()  # [N, 1, H, W]
    print(X.shape)
    # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing [Hedeya]
    traced_script_module = torch.jit.trace(model, X) #Hedeya
    traced_script_module.save("traced_unet-vgg16_model.pt") #Hedeya
    mask = model(X)
    print(mask.shape)
    #mask = F.sigmoid(mask[0, 0]).data.cpu().numpy()
    print(mask[0, 0].shape)
    mask = torch.sigmoid(mask[0, 0]).data.cpu().numpy() #Hedeya
    mask = cv.resize(mask, (img_width, img_height), cv.INTER_AREA)
    return mask
def evaluate_img_patch(model, img):
    input_width, input_height = input_size[0], input_size[1]
    img_height, img_width, img_channels = img.shape

    if img_width < input_width or img_height < input_height:
        return evaluate_img(model, img)

    stride_ratio = 0.1
    stride = int(input_width * stride_ratio)

    normalization_map = np.zeros((img_height, img_width), dtype=np.int16)

    patches = []
    patch_locs = []
    for y in range(0, img_height - input_height + 1, stride):
        for x in range(0, img_width - input_width + 1, stride):
            segment = img[y:y + input_height, x:x + input_width]
            normalization_map[y:y + input_height, x:x + input_width] += 1
            patches.append(segment)
            patch_locs.append((x, y))

    patches = np.array(patches)
    if len(patch_locs) <= 0:
        return None

    preds = []
    for i, patch in enumerate(patches):
        patch_n = train_tfms(Image.fromarray(patch))
        X = Variable(patch_n.unsqueeze(0)).cuda()  # [N, 1, H, W]
        masks_pred = model(X)
        #mask = F.sigmoid(masks_pred[0, 0]).data.cpu().numpy()
        mask = torch.sigmoid(masks_pred[0, 0]).data.cpu().numpy() #Hedeya
        preds.append(mask)

    probability_map = np.zeros((img_height, img_width), dtype=float)
    for i, response in enumerate(preds):
        coords = patch_locs[i]
        probability_map[coords[1]:coords[1] + input_height, coords[0]:coords[0] + input_width] += response

    return probability_map
def disable_axis():
    plt.axis('off')
    plt.gca().axes.get_xaxis().set_visible(False)
    plt.gca().axes.get_yaxis().set_visible(False)
    plt.gca().axes.get_xaxis().set_ticklabels([])
    plt.gca().axes.get_yaxis().set_ticklabels([])
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-img_dir', type=str, help='input dataset directory')
    parser.add_argument('-model_path', type=str, help='trained model path')
    parser.add_argument('-model_type', type=str, choices=['vgg16', 'resnet101', 'resnet34'])
    parser.add_argument('-out_viz_dir', type=str, default='', required=False, help='visualization output dir')
    parser.add_argument('-out_pred_dir', type=str, default='', required=False, help='prediction output dir')
    parser.add_argument('-threshold', type=float, default=0.2, help='threshold to cut off crack response')
    args = parser.parse_args()

    if args.out_viz_dir != '':
        os.makedirs(args.out_viz_dir, exist_ok=True)
        for path in Path(args.out_viz_dir).glob('*.*'):
            os.remove(str(path))

    if args.out_pred_dir != '':
        os.makedirs(args.out_pred_dir, exist_ok=True)
        for path in Path(args.out_pred_dir).glob('*.*'):
            os.remove(str(path))

    if args.model_type == 'vgg16':
        #model = load_unet_vgg16(args.model_path)
        model = load_unet_vgg16(args.model_path, None) #Hedeya + None I/O False
    elif args.model_type == 'resnet101':
        model = load_unet_resnet_101(args.model_path)
    elif args.model_type == 'resnet34':
        model = load_unet_resnet_34(args.model_path)
        print(model)
    else:
        print('undefined model name pattern')
        exit()

    channel_means = [0.485, 0.456, 0.406]
    channel_stds = [0.229, 0.224, 0.225]

    paths = [path for path in Path(args.img_dir).glob('*.*')]
    for path in tqdm(paths):
        #print(str(path))
        train_tfms = transforms.Compose([transforms.ToTensor(), transforms.Normalize(channel_means, channel_stds)])

        img_0 = Image.open(str(path))
        img_0 = np.asarray(img_0)
        if len(img_0.shape) != 3:
            print(f'incorrect image shape: {path.name}{img_0.shape}')
            continue

        img_0 = img_0[:, :, :3]
        img_height, img_width, img_channels = img_0.shape

        prob_map_full = evaluate_img(model, img_0)

        if args.out_pred_dir != '':
            cv.imwrite(filename=join(args.out_pred_dir, f'{path.stem}.jpg'), img=(prob_map_full * 255).astype(np.uint8))

        if args.out_viz_dir != '':
            # plt.subplot(121)
            # plt.imshow(img_0), plt.title(f'{img_0.shape}')
            if img_0.shape[0] > 2000 or img_0.shape[1] > 2000:
                img_1 = cv.resize(img_0, None, fx=0.2, fy=0.2, interpolation=cv.INTER_AREA)
            else:
                img_1 = img_0
            # plt.subplot(122)
            # plt.imshow(img_0), plt.title(f'{img_0.shape}')
            # plt.show()

            prob_map_patch = evaluate_img_patch(model, img_1)

            #plt.title(f'name={path.stem}. \n cut-off threshold = {args.threshold}', fontsize=4)
            prob_map_viz_patch = prob_map_patch.copy()
            prob_map_viz_patch = prob_map_viz_patch / prob_map_viz_patch.max()
            prob_map_viz_patch[prob_map_viz_patch < args.threshold] = 0.0

            fig = plt.figure()
            st = fig.suptitle(f'name={path.stem} \n cut-off threshold = {args.threshold}', fontsize="x-large")
            ax = fig.add_subplot(231)
            ax.imshow(img_1)
            ax = fig.add_subplot(232)
            ax.imshow(prob_map_viz_patch)
            ax = fig.add_subplot(233)
            ax.imshow(img_1)
            ax.imshow(prob_map_viz_patch, alpha=0.4)

            prob_map_viz_full = prob_map_full.copy()
            prob_map_viz_full[prob_map_viz_full < args.threshold] = 0.0

            ax = fig.add_subplot(234)
            ax.imshow(img_0)
            ax = fig.add_subplot(235)
            ax.imshow(prob_map_viz_full)
            ax = fig.add_subplot(236)
            ax.imshow(img_0)
            ax.imshow(prob_map_viz_full, alpha=0.4)

            plt.savefig(join(args.out_viz_dir, f'{path.stem}.jpg'), dpi=500)
            plt.close('all')

        gc.collect()
Below is the C++ code I use to deploy the model with libtorch:
#include <torch/torch.h>
#include <iostream>
#include <torch/script.h>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
//#include <opencv2/dnn.hpp>
std::string get_image_type(const cv::Mat& img, bool more_info = true)
{
    std::string r;
    int type = img.type();
    uchar depth = type & CV_MAT_DEPTH_MASK;
    uchar chans = 1 + (type >> CV_CN_SHIFT);
    switch (depth) {
        case CV_8U:  r = "8U";  break;
        case CV_8S:  r = "8S";  break;
        case CV_16U: r = "16U"; break;
        case CV_16S: r = "16S"; break;
        case CV_32S: r = "32S"; break;
        case CV_32F: r = "32F"; break;
        case CV_64F: r = "64F"; break;
        default:     r = "User"; break;
    }
    r += "C";
    r += (chans + '0');

    if (more_info)
        std::cout << "depth: " << img.depth() << " channels: " << img.channels() << std::endl;

    return r;
}
void show_image(cv::Mat& img, std::string title)
{
    std::string image_type = get_image_type(img);
    std::string window_name = title + " type:" + image_type;
    cv::namedWindow(window_name, cv::WINDOW_NORMAL); // Create a window for display.
    cv::imshow(window_name, img);                    // Use the same name so the image lands in that window.
    cv::waitKey(0);
}
auto transpose(torch::Tensor tensor, c10::IntArrayRef dims = { 0, 3, 1, 2 })
{
    std::cout << "############### transpose ############" << std::endl;
    std::cout << "shape before : " << tensor.sizes() << std::endl;
    tensor = tensor.permute(dims);
    std::cout << "shape after : " << tensor.sizes() << std::endl;
    std::cout << "######################################" << std::endl;
    return tensor;
}
auto ToTensor(cv::Mat img, bool show_output = false, bool unsqueeze = false, int unsqueeze_dim = 0)
{
    std::cout << "image shape: " << img.size() << std::endl;
    torch::Tensor tensor_image = torch::from_blob(img.data, { img.rows, img.cols, 3 }, torch::kByte);

    if (unsqueeze)
    {
        tensor_image.unsqueeze_(unsqueeze_dim);
        std::cout << "tensors new shape: " << tensor_image.sizes() << std::endl;
    }

    if (show_output)
    {
        std::cout << tensor_image.slice(2, 0, 1) << std::endl;
    }

    std::cout << "tensor shape: " << tensor_image.sizes() << std::endl;
    return tensor_image;
}
auto ToInput(torch::Tensor tensor_image)
{
    // Create a vector of inputs.
    return std::vector<torch::jit::IValue>{tensor_image};
}
auto ToCvImage(torch::Tensor tensor)
{
    int width = tensor.sizes()[0];
    int height = tensor.sizes()[1];
    try
    {
        cv::Mat output_mat(cv::Size{ height, width }, CV_8UC3, tensor.data_ptr<uchar>());
        show_image(output_mat, "converted image from tensor");
        return output_mat.clone();
    }
    catch (const c10::Error& e)
    {
        std::cout << "an error has occurred: " << e.msg() << std::endl;
    }
    return cv::Mat(height, width, CV_8UC3);
}
int main() {
    cv::Mat img = cv::imread("D:/Post_Grad/STDF/crack_segmentation-master_original/test_images_mine/00526.jpg");
    cv::Mat img_1;
    cv::resize(img, img_1, cv::Size(448, 448), 0, 0, cv::INTER_AREA);
    show_image(img_1, "Test Image");

    // convert the cvimage into tensor
    auto tensor = ToTensor(img_1);
    std::cout << "To Tensor: " << tensor.sizes() << std::endl;

    auto cv_img = ToCvImage(tensor);
    show_image(cv_img, "converted image from tensor");

    // swap axis
    tensor = transpose(tensor, { (2),(0),(1) });
    std::cout << "transpose: " << tensor.sizes() << std::endl;

    // convert the tensor into float and scale it
    tensor = tensor.toType(c10::kFloat).div(255);
    // normalize
    tensor[0] = tensor[0].sub_(0.485).div_(0.229);
    tensor[1] = tensor[1].sub_(0.456).div_(0.224);
    tensor[2] = tensor[2].sub_(0.406).div_(0.225);

    // add batch dim (an inplace operation just like in pytorch)
    tensor.unsqueeze_(0);
    tensor = tensor.to(torch::kCUDA);
    std::cout << "unsqueeze: " << tensor.sizes() << std::endl;

    auto input_to_net = ToInput(tensor);

    torch::jit::script::Module module;
    try
    {
        // Deserialize the ScriptModule from a file using torch::jit::load().
        module = torch::jit::load("D:/Post_Grad/STDF/crack_segmentation-master_original/traced_unet-vgg16_model.pt");

        // Execute the model and turn its output into a tensor.
        torch::Tensor output = module.forward(input_to_net).toTensor();
        // sizes() gives shape.
        std::cout << output.sizes() << std::endl;
        //std::cout << "output: " << output[0] << std::endl;
        //std::cout << output.slice(/*dim=*/1, /*start=*/0, /*end=*/5) << '\n';

        output = torch::sigmoid(output);
        auto out_tensor = output.squeeze(0).detach().permute({ 1, 2, 0 });
        //auto out_tensor = output.squeeze().detach();
        std::cout << "out_tensor (after squeeze & detach): " << out_tensor.sizes() << std::endl;
        out_tensor = out_tensor.mul(255).clamp(0, 255).to(torch::kU8);
        out_tensor = out_tensor.to(torch::kCPU);

        cv::Mat resultImg(448, 448, CV_8UC3);
        std::memcpy((void*)resultImg.data, out_tensor.data_ptr(), sizeof(torch::kU8) * out_tensor.numel());
        cv::resize(resultImg, resultImg, cv::Size(1280, 720), 0, 0, cv::INTER_AREA);
        cv::imwrite("D:/Post_Grad/STDF/crack_segmentation-master_original/test_images_mine/00526-seg-2.jpg", resultImg);
    }
    catch (const c10::Error& e)
    {
        std::cerr << "error loading the model\n" << e.msg();
        std::system("pause");
        return -1;
    }

    std::cout << "ok\n";
    std::system("pause");
    return 0;
    //std::cin.get();
}
The output looks similar to the PyTorch output, but it is repeated 3 times side by side.
I could not find the cause of this error in the C++ code above. Please help me check it and advise.
The problem is here:
cv::Mat resultImg(448, 448, CV_8UC3);
This declares a matrix with 3 color channels, but you are saving an image that has only a single channel.
Change it to this:
cv::Mat resultImg(448, 448, CV_8U);
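For reference, here is a minimal sketch of the corrected post-processing step, assuming (as the code above suggests) that the traced UNet16 returns a single-channel [1, 1, 448, 448] tensor; `module` and `input_to_net` are the variables from the question:

// Minimal sketch of the single-channel post-processing; assumes the traced
// model returns a [1, 1, 448, 448] probability map as discussed above.
torch::Tensor output = module.forward(input_to_net).toTensor();
output = torch::sigmoid(output);

// [1, 1, H, W] -> [H, W]; make it uint8 and contiguous on the CPU
// before wrapping the raw buffer in a cv::Mat.
torch::Tensor mask = output.squeeze().detach()
    .mul(255).clamp(0, 255)
    .to(torch::kU8).to(torch::kCPU).contiguous();

cv::Mat resultImg(448, 448, CV_8U); // one channel, not CV_8UC3
std::memcpy(resultImg.data, mask.data_ptr<uchar>(),
            mask.numel() * sizeof(uchar)); // exactly 448*448 bytes
cv::imwrite("mask.jpg", resultImg);

With a single-channel matrix the row stride matches the 448*448-byte tensor buffer, so each mask row fills exactly one image row; interpreting that same buffer through 3-channel rows is what produced the three side-by-side copies.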
I am not getting correct results from a UNet segmentation model in C++. Using the ONNX file in Python I get the correct result, but when I try to reproduce the same result in C++ the output is wrong. This is the desired output I get with the Python code: (python output). But with C++ I get this result: (c++ output). Can anyone help me find what is wrong with the code:
#include <opencv2/opencv.hpp>
#include <onnxruntime_cxx_api.h>
#include <iostream>
#include <vector>
#include <stdexcept>
#include <memory>
#include <cassert>
// Function to load and preprocess the image
cv::Mat preprocess_image(const std::string& image_path, cv::Size target_size) {
    cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
    if (image.empty()) {
        throw std::runtime_error("Error loading image: " + image_path);
    }
    cv::resize(image, image, target_size);
    image.convertTo(image, CV_32FC3, 1.0 / 255.0); // Normalize to [0, 1]

    // Convert from HWC to CHW
    cv::Mat chw_image(target_size.height * 3, target_size.width, CV_32FC1);
    std::vector<cv::Mat> channels(3);
    cv::split(image, channels);
    for (int i = 0; i < 3; ++i) {
        channels[i].copyTo(chw_image(cv::Rect(0, i * target_size.height, target_size.width, target_size.height)));
    }
    return chw_image;
}
int main() {
    try {
        // Initialize ONNX Runtime
        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ONNXModel");

        // Create ONNX Runtime session options
        Ort::SessionOptions session_options;
        session_options.SetIntraOpNumThreads(1);
        session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED);

        // Load the ONNX model
        std::string modelPath = "D:/cms_trucks/Pytorch-UNet-master/unet_cms_trucks5.onnx";
#ifdef _WIN32
        std::wstring modelPathW(modelPath.begin(), modelPath.end());
        Ort::Session session(env, modelPathW.c_str(), session_options);
#else
        Ort::Session session(env, modelPath.c_str(), session_options);
#endif

        // Load and preprocess the images
        cv::Mat input_image_1 = preprocess_image("D:/cms_trucks/videos/image1.JPG", cv::Size(616, 589));
        cv::Mat input_image_2 = preprocess_image("D:/cms_trucks/videos/image2.JPG", cv::Size(616, 589));

        // Combine images into one tensor
        std::vector<cv::Mat> input_images = { input_image_1, input_image_2 };
        std::vector<float> input_tensor_values(2 * 3 * 589 * 616); // 2 images, 3 channels, 589x616 resolution

        // Fill tensor values from input_images
        int index = 0;
        for (const auto& img : input_images) {
            std::vector<cv::Mat> channels(3);
            cv::split(img, channels);
            for (const auto& ch : channels) {
                std::memcpy(input_tensor_values.data() + index, ch.data, ch.total() * sizeof(float));
                index += ch.total();
            }
        }

        // Create ONNX Runtime tensor object
        Ort::AllocatorWithDefaultOptions allocator;
        std::vector<int64_t> input_tensor_shape = { 2, 3, 589, 616 }; // (N, C, H, W)
        Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
        Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
            memory_info,
            input_tensor_values.data(),
            input_tensor_values.size(),
            input_tensor_shape.data(),
            input_tensor_shape.size()
        );

        // Run inference
        const char* input_names[] = { "input" };
        const char* output_names[] = { "output" };
        std::vector<Ort::Value> outputs = session.Run(
            Ort::RunOptions{ nullptr },
            input_names, &input_tensor, 1,
            output_names, 1
        );

        // Get output tensor
        Ort::Value& output_tensor = outputs.front();
        auto output_tensor_info = output_tensor.GetTensorTypeAndShapeInfo();
        std::vector<int64_t> output_tensor_shape = output_tensor_info.GetShape();
        float* output_data = output_tensor.GetTensorMutableData<float>();

        // Display output details
        std::cout << "Output Tensor Shape: ";
        for (auto dim : output_tensor_shape) {
            std::cout << dim << " ";
        }
        std::cout << std::endl;

        // Reshape output to match (2, 2, 589, 616) and display images
        int batch_size = output_tensor_shape[0];
        int num_channels = output_tensor_shape[1];
        int height = output_tensor_shape[2];
        int width = output_tensor_shape[3];

        for (int i = 0; i < batch_size; ++i) {
            std::vector<cv::Mat> channels(num_channels);
            for (int c = 0; c < num_channels; ++c) {
                channels[c] = cv::Mat(height, width, CV_32FC1, output_data + i * num_channels * height * width + c * height * width);
            }

            cv::Mat output_image;
            // Handle 2-channel images by adding a dummy channel
            if (num_channels == 2) {
                cv::Mat dummy_channel = cv::Mat::zeros(height, width, CV_32FC1);
                channels.push_back(dummy_channel);
                cv::merge(channels, output_image);
                output_image.convertTo(output_image, CV_8U, 255.0);
            }
            // Handle 3-channel images
            else if (num_channels == 3) {
                cv::merge(channels, output_image);
                output_image.convertTo(output_image, CV_8UC3, 255.0);
            }
            // Handle unexpected number of channels
            else {
                std::cerr << "Unexpected number of channels: " << num_channels << std::endl;
                continue;
            }

            // Display the image
            cv::imshow("Output Image " + std::to_string(i), output_image);
        }

        cv::waitKey(0);
        cv::destroyAllWindows();
    }
    catch (const cv::Exception& ex) {
        std::cerr << "OpenCV error: " << ex.what() << std::endl;
        return -1;
    }
    catch (const Ort::Exception& ex) {
        std::cerr << "ONNX Runtime error: " << ex.what() << std::endl;
        return -1;
    }
    catch (const std::exception& ex) {
        std::cerr << "Standard error: " << ex.what() << std::endl;
        return -1;
    }
    return 0;
}
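One thing worth checking while debugging: the display step here merges the two raw output channels with a dummy third channel into a false-color image, which is unlikely to match what the Python pipeline renders. For a 2-class UNet the usual post-processing is a per-pixel argmax over the class channels; this is an assumption about the model, not something the question confirms. A minimal sketch, reusing the `channels`, `height`, `width`, and `i` variables from the loop above:

// Hypothetical post-processing sketch: per-pixel argmax over the two
// class-logit planes (argmax of raw logits equals argmax of softmax).
cv::Mat mask(height, width, CV_8U);
for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
        float bg = channels[0].at<float>(y, x);   // class-0 (background) logit
        float fg = channels[1].at<float>(y, x);   // class-1 (foreground) logit
        mask.at<uchar>(y, x) = fg > bg ? 255 : 0; // scaled to 0/255 for display
    }
}
cv::imshow("Predicted mask " + std::to_string(i), mask);

Note also that convertTo(..., 255.0) applied to raw logits clips badly when values are not in [0, 1]; a sigmoid/softmax (or the argmax above) should come first.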