Yolo 为每个输入输出完全相同的边界框

Question

我试图在 Pascal VOC 上训练一个 Yolo 模型，在大约 120 个 epoch 之后，当我尝试测试它时，我意识到它为每个图像输出完全相同的边界框，甚至只是随机噪声。

这是模型：

class ConvLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=None, bn=True, alpha=0.1):
        super().__init__()
        if padding == None:
            padding = kernel_size // 2

        bn = nn.Identity()
        if bn == True:
            bn = nn.BatchNorm2d(out_channels)

        self.layer = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
            bn,
            nn.LeakyReLU(alpha)
        )

    def forward(self, x):
        return self.layer(x)

class YOLOv1(nn.Module):
    def __init__(self, S=7, B=2, C=20):
        super().__init__()

        self.S = S
        self.B = B
        self.C = C
        # ConvNet
        self.conv1 = ConvLayer(3, 64, 7, stride=2)
        self.mp1 = nn.MaxPool2d(2)

        self.conv2 = ConvLayer(64, 192, 3)
        self.mp2 = nn.MaxPool2d(2)

        self.conv3 = nn.Sequential(
            ConvLayer(192, 128, 1),
            ConvLayer(128, 256, 3),
            ConvLayer(256, 256, 1),
            ConvLayer(256, 512, 3)
        )
        self.mp3 = nn.MaxPool2d(2)

        conv4_block = []
        for _ in range(4):
            conv4_block.append(ConvLayer(512, 256, 1))
            conv4_block.append(ConvLayer(256, 512, 3))

        conv4_block.append(ConvLayer(512, 512, 1))
        conv4_block.append(ConvLayer(512, 1024, 3))

        self.conv4 = nn.Sequential(*conv4_block)
        self.mp4 = nn.MaxPool2d(2)

        self.conv5 = nn.Sequential(
            ConvLayer(1024, 512, 1),
            ConvLayer(512, 1024, 3),
            ConvLayer(1024, 512, 1),
            ConvLayer(512, 1024, 3),
            ConvLayer(1024, 1024, 3),
            ConvLayer(1024, 1024, 3, stride=2)
        )

        self.conv6 = nn.Sequential(
            ConvLayer(1024, 1024, 3),
            ConvLayer(1024, 1024, 3)
        )

        # FC

        self.ffn = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * 7 * 7, 4096),
            nn.BatchNorm1d(4096),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, S * S * (B * 5 + C)),
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = self.mp1(self.conv1(x))
        x = self.mp2(self.conv2(x))
        x = self.mp3(self.conv3(x))
        x = self.mp4(self.conv4(x))
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.ffn(x)
        x = x.view(-1, self.S, self.S, (self.B * 5 + self.C))
        return x

这是损失：

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.ops.boxes as bops
from time import perf_counter as time

def iou(bbox1, bbox2):
    bbox1 = bbox1.clone().detach()
    bbox2 = bbox2.clone().detach()
    bbox1 += 0.000000001
    bbox2 -= 0.000000001
    bb1w = bbox1[:, 2].clone()
    bb1h = bbox1[:, 3].clone()
    bb2w = bbox2[:, 2].clone()
    bb2h = bbox2[:, 3].clone()
    bbox1[:, 2] = bbox1[:, 0] + bb1w / 2
    bbox1[:, 0] = bbox1[:, 0] - bb1w / 2
    bbox1[:, 3] = bbox1[:, 1] + bb1h / 2
    bbox1[:, 1] = bbox1[:, 1] - bb1h / 2

    bbox2[:, 2] = bbox2[:, 0] + bb2w / 2
    bbox2[:, 0] = bbox2[:, 0] - bb2w / 2
    bbox2[:, 3] = bbox2[:, 1] + bb2h / 2
    bbox2[:, 1] = bbox2[:, 1] - bb2h / 2
    #convert to [x1, y1, x2, y2]
    out = torch.diagonal(bops.box_iou(bbox1, bbox2)).unsqueeze(0)
    #out = out.nan_to_num(1)
    return out


class YoloLoss(nn.Module):
    def __init__(self, lambda_coord=5, lambda_noobj=0.5, S=7, B=2, C=20):
        super().__init__()
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
        self.S = S
        self.B = B
        self.C = C
        self.mse = nn.MSELoss(reduction='sum')

    def forward(self, predictions, labels):
        ''' 
        ONLY WORKS WITH 2 BOUNDING BOXES FOR NOW WILL CHANGE LATER
        # predictions of shape (B, S, S, (B * 5 + C))
        # Last dim is [x, y, w, h, Pr(c), x2, y2, w2, h2, Pr(c2), Pr(C1), Pr(C2) ... Pr(Cn)]
        '''

        object_inds = labels[..., 4] == 1

        obj_preds = predictions[object_inds]
        obj_labels = labels[object_inds]

        noobj_preds = predictions[~object_inds]
        noobj_labels = predictions[~object_inds]

        # Coordinate losses precalculation

        box1_pred = obj_preds[..., 0:5]
        box2_pred = obj_preds[..., 5:10]
        box_target = obj_labels[..., 0:5]

        ioub1 = iou(box1_pred[:, :4], box_target[:, :4]) # Index to not include class
        ioub2 = iou(box2_pred[:, :4], box_target[:, :4])

        best_boxes = torch.where((ioub1 > ioub2).reshape(-1, 1), box1_pred, box2_pred)

        # Loss for bounding box centers in cells with objects

        bbox_center_loss = self.mse(best_boxes[:, :2], box_target[:, :2])

        # Loss for bounding box dimensions in cells with objects

        bbox_dim_loss = self.mse(torch.sqrt(best_boxes[:, 2:4]), torch.sqrt(box_target[:, 2:4]))

        # Loss for confidence in cells with objects

        obj_conf_loss = self.mse(best_boxes[:, 4], box_target[:, 4])

        # Loss for confidence in cells without objects

        noobj_pred_conf = noobj_preds[:, [4, 9]] # will change to torch.arange(4, 5 * self.B, 5) later
        noobj_label_conf = noobj_labels[:, [4, 4]] # just [4] will work but pytorch gives broadcasting warning so whatever

        noobj_conf_loss = self.mse(noobj_pred_conf, noobj_label_conf)

        # Loss for classification in cells with objects

        classification_loss = self.mse(obj_preds[:, 10:], obj_labels[:, 10:])

        # FINAL LOSS

        coord_loss = self.lambda_coord * (bbox_center_loss + bbox_dim_loss)
        confidence_loss = obj_conf_loss + self.lambda_noobj * noobj_conf_loss

        loss = (coord_loss + confidence_loss + classification_loss) / predictions.shape[0]

        return loss

在 Kaggle 上训练之后，当我尝试像这样加载和测试模型时：

epochs = 10
device = torch.device("mps")
batch_size = 1
classes = ['horse', 'person', 'bottle', 'dog', 'tvmonitor', 'car', 'aeroplane', 'bicycle', 'boat', 'chair', 'diningtable', 'pottedplant', 'train', 'cat', 'sofa', 'bird', 'sheep', 'motorbike', 'bus', 'cow']
dataloader = get_dataset(batch_size)


loss_func = YoloLoss()
model = YOLOv1()
model = nn.DataParallel(model)
cp = torch.load("data/model.pt", map_location=torch.device('cpu'))
model.load_state_dict(cp)
model.eval()
lr = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.00005, nesterov=True)


i, o = next(iter(dataloader))
i2, o2 = next(iter(dataloader))
l1 = model(i)
l2 = model(i2)
l1 = l1[0]
l2 = l2[0]
l1 = l1[l1[:, :, 4] > 0.99] # :, :, 4 is object confidence for each cell
l2 = l2[l2[:, :, 4] > 0.99]
print(l1)
print(l2)

那些返回几乎完全相同的东西：

tensor([[0.7578, 0.5948, 0.4245, 0.5536, 0.9923, 0.3042, 0.4912, 0.2626, 0.3866,
         0.9923, 0.0386, 0.3109, 0.0398, 0.0613, 0.0330, 0.0379, 0.0325, 0.0272,
         0.0383, 0.0519, 0.0260, 0.0312, 0.0401, 0.0538, 0.0309, 0.0521, 0.0384,
         0.0329, 0.0340, 0.0368],
        [0.7746, 0.3583, 0.3543, 0.4720, 0.9911, 0.3432, 0.4972, 0.2896, 0.4189,
         0.9912, 0.0489, 0.3794, 0.0399, 0.0676, 0.0323, 0.0391, 0.0310, 0.0372,
         0.0384, 0.0603, 0.0328, 0.0350, 0.0359, 0.0419, 0.0374, 0.0477, 0.0429,
         0.0350, 0.0318, 0.0371],
        [0.4770, 0.6212, 0.6538, 0.6942, 0.9960, 0.5357, 0.2490, 0.4338, 0.4130,
         0.9947, 0.0262, 0.1659, 0.0200, 0.0818, 0.0192, 0.0515, 0.0673, 0.0292,
         0.0252, 0.0268, 0.0164, 0.0196, 0.0305, 0.0875, 0.0246, 0.0428, 0.0231,
         0.0342, 0.0339, 0.0221],
        [0.4482, 0.2498, 0.5434, 0.5681, 0.9936, 0.5363, 0.6450, 0.3235, 0.3887,
         0.9932, 0.0366, 0.2673, 0.0321, 0.0518, 0.0249, 0.0485, 0.0317, 0.0305,
         0.0306, 0.0411, 0.0354, 0.0250, 0.0310, 0.0429, 0.0506, 0.0445, 0.0311,
         0.0358, 0.0272, 0.0313],
        [0.2482, 0.6096, 0.4617, 0.5805, 0.9923, 0.6826, 0.5125, 0.2795, 0.4089,
         0.9922, 0.0402, 0.3383, 0.0374, 0.0656, 0.0312, 0.0423, 0.0328, 0.0280,
         0.0360, 0.0415, 0.0262, 0.0321, 0.0410, 0.0552, 0.0310, 0.0507, 0.0377,
         0.0338, 0.0321, 0.0328],
        [0.2101, 0.3151, 0.3860, 0.5073, 0.9912, 0.6172, 0.5605, 0.2832, 0.3956,
         0.9913, 0.0448, 0.3436, 0.0400, 0.0695, 0.0332, 0.0468, 0.0330, 0.0358,
         0.0352, 0.0812, 0.0325, 0.0345, 0.0323, 0.0488, 0.0380, 0.0403, 0.0478,
         0.0377, 0.0313, 0.0354]], grad_fn=<IndexBackward0>)
tensor([[0.7578, 0.5948, 0.4245, 0.5536, 0.9923, 0.3042, 0.4912, 0.2626, 0.3866,
         0.9923, 0.0386, 0.3109, 0.0398, 0.0613, 0.0330, 0.0379, 0.0325, 0.0272,
         0.0383, 0.0519, 0.0260, 0.0312, 0.0401, 0.0538, 0.0309, 0.0521, 0.0384,
         0.0329, 0.0340, 0.0368],
        [0.7746, 0.3583, 0.3543, 0.4720, 0.9911, 0.3432, 0.4972, 0.2896, 0.4189,
         0.9912, 0.0489, 0.3794, 0.0399, 0.0676, 0.0323, 0.0391, 0.0310, 0.0372,
         0.0384, 0.0603, 0.0328, 0.0350, 0.0359, 0.0419, 0.0374, 0.0477, 0.0429,
         0.0350, 0.0318, 0.0371],
        [0.4770, 0.6212, 0.6538, 0.6942, 0.9960, 0.5357, 0.2490, 0.4338, 0.4130,
         0.9947, 0.0262, 0.1659, 0.0200, 0.0818, 0.0192, 0.0515, 0.0673, 0.0292,
         0.0252, 0.0268, 0.0164, 0.0196, 0.0305, 0.0875, 0.0246, 0.0428, 0.0231,
         0.0342, 0.0339, 0.0221],
        [0.4482, 0.2498, 0.5434, 0.5681, 0.9936, 0.5363, 0.6450, 0.3235, 0.3887,
         0.9932, 0.0366, 0.2673, 0.0321, 0.0518, 0.0249, 0.0485, 0.0317, 0.0305,
         0.0306, 0.0411, 0.0354, 0.0250, 0.0310, 0.0429, 0.0506, 0.0445, 0.0311,
         0.0358, 0.0272, 0.0313],
        [0.2482, 0.6096, 0.4617, 0.5805, 0.9923, 0.6826, 0.5125, 0.2795, 0.4089,
         0.9922, 0.0402, 0.3383, 0.0374, 0.0656, 0.0312, 0.0423, 0.0328, 0.0280,
         0.0360, 0.0415, 0.0262, 0.0321, 0.0410, 0.0552, 0.0310, 0.0507, 0.0377,
         0.0338, 0.0321, 0.0328],
        [0.2101, 0.3151, 0.3860, 0.5073, 0.9912, 0.6172, 0.5605, 0.2832, 0.3956,
         0.9913, 0.0448, 0.3436, 0.0400, 0.0695, 0.0332, 0.0468, 0.0330, 0.0358,
         0.0352, 0.0812, 0.0325, 0.0345, 0.0323, 0.0488, 0.0380, 0.0403, 0.0478,
         0.0377, 0.0313, 0.0354]], grad_fn=<IndexBackward0>)

甚至随机噪声也是一样的：

我知道我不是每次都在模型中意外输入相同的图像，因为我手动检查过。我觉得可能出了什么问题，即使训练完全失败，也不应该每次都输出几乎完全相同的东西。

Yolo 为每个输入输出完全相同的边界框

问题描述投票：0回答：0

最新问题

Yolo 为每个输入输出完全相同的边界框

问题描述 投票：0回答：0

最新问题

问题描述投票：0回答：0