我试图在 Pascal VOC 上训练一个 Yolo 模型,在大约 120 个 epoch 之后,当我尝试测试它时,我意识到它为每个图像输出完全相同的边界框,甚至只是随机噪声。
这是模型:
class ConvLayer(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=None, bn=True, alpha=0.1):
super().__init__()
if padding == None:
padding = kernel_size // 2
bn = nn.Identity()
if bn == True:
bn = nn.BatchNorm2d(out_channels)
self.layer = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
bn,
nn.LeakyReLU(alpha)
)
def forward(self, x):
return self.layer(x)
class YOLOv1(nn.Module):
def __init__(self, S=7, B=2, C=20):
super().__init__()
self.S = S
self.B = B
self.C = C
# ConvNet
self.conv1 = ConvLayer(3, 64, 7, stride=2)
self.mp1 = nn.MaxPool2d(2)
self.conv2 = ConvLayer(64, 192, 3)
self.mp2 = nn.MaxPool2d(2)
self.conv3 = nn.Sequential(
ConvLayer(192, 128, 1),
ConvLayer(128, 256, 3),
ConvLayer(256, 256, 1),
ConvLayer(256, 512, 3)
)
self.mp3 = nn.MaxPool2d(2)
conv4_block = []
for _ in range(4):
conv4_block.append(ConvLayer(512, 256, 1))
conv4_block.append(ConvLayer(256, 512, 3))
conv4_block.append(ConvLayer(512, 512, 1))
conv4_block.append(ConvLayer(512, 1024, 3))
self.conv4 = nn.Sequential(*conv4_block)
self.mp4 = nn.MaxPool2d(2)
self.conv5 = nn.Sequential(
ConvLayer(1024, 512, 1),
ConvLayer(512, 1024, 3),
ConvLayer(1024, 512, 1),
ConvLayer(512, 1024, 3),
ConvLayer(1024, 1024, 3),
ConvLayer(1024, 1024, 3, stride=2)
)
self.conv6 = nn.Sequential(
ConvLayer(1024, 1024, 3),
ConvLayer(1024, 1024, 3)
)
# FC
self.ffn = nn.Sequential(
nn.Flatten(),
nn.Linear(1024 * 7 * 7, 4096),
nn.BatchNorm1d(4096),
nn.LeakyReLU(0.1),
nn.Linear(4096, S * S * (B * 5 + C)),
nn.Sigmoid(),
)
def forward(self, x):
x = self.mp1(self.conv1(x))
x = self.mp2(self.conv2(x))
x = self.mp3(self.conv3(x))
x = self.mp4(self.conv4(x))
x = self.conv5(x)
x = self.conv6(x)
x = self.ffn(x)
x = x.view(-1, self.S, self.S, (self.B * 5 + self.C))
return x
这是损失:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.ops.boxes as bops
from time import perf_counter as time
def iou(bbox1, bbox2):
bbox1 = bbox1.clone().detach()
bbox2 = bbox2.clone().detach()
bbox1 += 0.000000001
bbox2 -= 0.000000001
bb1w = bbox1[:, 2].clone()
bb1h = bbox1[:, 3].clone()
bb2w = bbox2[:, 2].clone()
bb2h = bbox2[:, 3].clone()
bbox1[:, 2] = bbox1[:, 0] + bb1w / 2
bbox1[:, 0] = bbox1[:, 0] - bb1w / 2
bbox1[:, 3] = bbox1[:, 1] + bb1h / 2
bbox1[:, 1] = bbox1[:, 1] - bb1h / 2
bbox2[:, 2] = bbox2[:, 0] + bb2w / 2
bbox2[:, 0] = bbox2[:, 0] - bb2w / 2
bbox2[:, 3] = bbox2[:, 1] + bb2h / 2
bbox2[:, 1] = bbox2[:, 1] - bb2h / 2
#convert to [x1, y1, x2, y2]
out = torch.diagonal(bops.box_iou(bbox1, bbox2)).unsqueeze(0)
#out = out.nan_to_num(1)
return out
class YoloLoss(nn.Module):
def __init__(self, lambda_coord=5, lambda_noobj=0.5, S=7, B=2, C=20):
super().__init__()
self.lambda_coord = lambda_coord
self.lambda_noobj = lambda_noobj
self.S = S
self.B = B
self.C = C
self.mse = nn.MSELoss(reduction='sum')
def forward(self, predictions, labels):
'''
ONLY WORKS WITH 2 BOUNDING BOXES FOR NOW WILL CHANGE LATER
# predictions of shape (B, S, S, (B * 5 + C))
# Last dim is [x, y, w, h, Pr(c), x2, y2, w2, h2, Pr(c2), Pr(C1), Pr(C2) ... Pr(Cn)]
'''
object_inds = labels[..., 4] == 1
obj_preds = predictions[object_inds]
obj_labels = labels[object_inds]
noobj_preds = predictions[~object_inds]
noobj_labels = predictions[~object_inds]
# Coordinate losses precalculation
box1_pred = obj_preds[..., 0:5]
box2_pred = obj_preds[..., 5:10]
box_target = obj_labels[..., 0:5]
ioub1 = iou(box1_pred[:, :4], box_target[:, :4]) # Index to not include class
ioub2 = iou(box2_pred[:, :4], box_target[:, :4])
best_boxes = torch.where((ioub1 > ioub2).reshape(-1, 1), box1_pred, box2_pred)
# Loss for bounding box centers in cells with objects
bbox_center_loss = self.mse(best_boxes[:, :2], box_target[:, :2])
# Loss for bounding box dimensions in cells with objects
bbox_dim_loss = self.mse(torch.sqrt(best_boxes[:, 2:4]), torch.sqrt(box_target[:, 2:4]))
# Loss for confidence in cells with objects
obj_conf_loss = self.mse(best_boxes[:, 4], box_target[:, 4])
# Loss for confidence in cells without objects
noobj_pred_conf = noobj_preds[:, [4, 9]] # will change to torch.arange(4, 5 * self.B, 5) later
noobj_label_conf = noobj_labels[:, [4, 4]] # just [4] will work but pytorch gives broadcasting warning so whatever
noobj_conf_loss = self.mse(noobj_pred_conf, noobj_label_conf)
# Loss for classification in cells with objects
classification_loss = self.mse(obj_preds[:, 10:], obj_labels[:, 10:])
# FINAL LOSS
coord_loss = self.lambda_coord * (bbox_center_loss + bbox_dim_loss)
confidence_loss = obj_conf_loss + self.lambda_noobj * noobj_conf_loss
loss = (coord_loss + confidence_loss + classification_loss) / predictions.shape[0]
return loss
在 Kaggle 上训练之后,当我尝试像这样加载和测试模型时:
epochs = 10
device = torch.device("mps")
batch_size = 1
classes = ['horse', 'person', 'bottle', 'dog', 'tvmonitor', 'car', 'aeroplane', 'bicycle', 'boat', 'chair', 'diningtable', 'pottedplant', 'train', 'cat', 'sofa', 'bird', 'sheep', 'motorbike', 'bus', 'cow']
dataloader = get_dataset(batch_size)
loss_func = YoloLoss()
model = YOLOv1()
model = nn.DataParallel(model)
cp = torch.load("data/model.pt", map_location=torch.device('cpu'))
model.load_state_dict(cp)
model.eval()
lr = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.00005, nesterov=True)
i, o = next(iter(dataloader))
i2, o2 = next(iter(dataloader))
l1 = model(i)
l2 = model(i2)
l1 = l1[0]
l2 = l2[0]
l1 = l1[l1[:, :, 4] > 0.99] # :, :, 4 is object confidence for each cell
l2 = l2[l2[:, :, 4] > 0.99]
print(l1)
print(l2)
那些返回几乎完全相同的东西:
tensor([[0.7578, 0.5948, 0.4245, 0.5536, 0.9923, 0.3042, 0.4912, 0.2626, 0.3866,
0.9923, 0.0386, 0.3109, 0.0398, 0.0613, 0.0330, 0.0379, 0.0325, 0.0272,
0.0383, 0.0519, 0.0260, 0.0312, 0.0401, 0.0538, 0.0309, 0.0521, 0.0384,
0.0329, 0.0340, 0.0368],
[0.7746, 0.3583, 0.3543, 0.4720, 0.9911, 0.3432, 0.4972, 0.2896, 0.4189,
0.9912, 0.0489, 0.3794, 0.0399, 0.0676, 0.0323, 0.0391, 0.0310, 0.0372,
0.0384, 0.0603, 0.0328, 0.0350, 0.0359, 0.0419, 0.0374, 0.0477, 0.0429,
0.0350, 0.0318, 0.0371],
[0.4770, 0.6212, 0.6538, 0.6942, 0.9960, 0.5357, 0.2490, 0.4338, 0.4130,
0.9947, 0.0262, 0.1659, 0.0200, 0.0818, 0.0192, 0.0515, 0.0673, 0.0292,
0.0252, 0.0268, 0.0164, 0.0196, 0.0305, 0.0875, 0.0246, 0.0428, 0.0231,
0.0342, 0.0339, 0.0221],
[0.4482, 0.2498, 0.5434, 0.5681, 0.9936, 0.5363, 0.6450, 0.3235, 0.3887,
0.9932, 0.0366, 0.2673, 0.0321, 0.0518, 0.0249, 0.0485, 0.0317, 0.0305,
0.0306, 0.0411, 0.0354, 0.0250, 0.0310, 0.0429, 0.0506, 0.0445, 0.0311,
0.0358, 0.0272, 0.0313],
[0.2482, 0.6096, 0.4617, 0.5805, 0.9923, 0.6826, 0.5125, 0.2795, 0.4089,
0.9922, 0.0402, 0.3383, 0.0374, 0.0656, 0.0312, 0.0423, 0.0328, 0.0280,
0.0360, 0.0415, 0.0262, 0.0321, 0.0410, 0.0552, 0.0310, 0.0507, 0.0377,
0.0338, 0.0321, 0.0328],
[0.2101, 0.3151, 0.3860, 0.5073, 0.9912, 0.6172, 0.5605, 0.2832, 0.3956,
0.9913, 0.0448, 0.3436, 0.0400, 0.0695, 0.0332, 0.0468, 0.0330, 0.0358,
0.0352, 0.0812, 0.0325, 0.0345, 0.0323, 0.0488, 0.0380, 0.0403, 0.0478,
0.0377, 0.0313, 0.0354]], grad_fn=<IndexBackward0>)
tensor([[0.7578, 0.5948, 0.4245, 0.5536, 0.9923, 0.3042, 0.4912, 0.2626, 0.3866,
0.9923, 0.0386, 0.3109, 0.0398, 0.0613, 0.0330, 0.0379, 0.0325, 0.0272,
0.0383, 0.0519, 0.0260, 0.0312, 0.0401, 0.0538, 0.0309, 0.0521, 0.0384,
0.0329, 0.0340, 0.0368],
[0.7746, 0.3583, 0.3543, 0.4720, 0.9911, 0.3432, 0.4972, 0.2896, 0.4189,
0.9912, 0.0489, 0.3794, 0.0399, 0.0676, 0.0323, 0.0391, 0.0310, 0.0372,
0.0384, 0.0603, 0.0328, 0.0350, 0.0359, 0.0419, 0.0374, 0.0477, 0.0429,
0.0350, 0.0318, 0.0371],
[0.4770, 0.6212, 0.6538, 0.6942, 0.9960, 0.5357, 0.2490, 0.4338, 0.4130,
0.9947, 0.0262, 0.1659, 0.0200, 0.0818, 0.0192, 0.0515, 0.0673, 0.0292,
0.0252, 0.0268, 0.0164, 0.0196, 0.0305, 0.0875, 0.0246, 0.0428, 0.0231,
0.0342, 0.0339, 0.0221],
[0.4482, 0.2498, 0.5434, 0.5681, 0.9936, 0.5363, 0.6450, 0.3235, 0.3887,
0.9932, 0.0366, 0.2673, 0.0321, 0.0518, 0.0249, 0.0485, 0.0317, 0.0305,
0.0306, 0.0411, 0.0354, 0.0250, 0.0310, 0.0429, 0.0506, 0.0445, 0.0311,
0.0358, 0.0272, 0.0313],
[0.2482, 0.6096, 0.4617, 0.5805, 0.9923, 0.6826, 0.5125, 0.2795, 0.4089,
0.9922, 0.0402, 0.3383, 0.0374, 0.0656, 0.0312, 0.0423, 0.0328, 0.0280,
0.0360, 0.0415, 0.0262, 0.0321, 0.0410, 0.0552, 0.0310, 0.0507, 0.0377,
0.0338, 0.0321, 0.0328],
[0.2101, 0.3151, 0.3860, 0.5073, 0.9912, 0.6172, 0.5605, 0.2832, 0.3956,
0.9913, 0.0448, 0.3436, 0.0400, 0.0695, 0.0332, 0.0468, 0.0330, 0.0358,
0.0352, 0.0812, 0.0325, 0.0345, 0.0323, 0.0488, 0.0380, 0.0403, 0.0478,
0.0377, 0.0313, 0.0354]], grad_fn=<IndexBackward0>)
甚至随机噪声也是一样的:
我知道我不是每次都在模型中意外输入相同的图像,因为我手动检查过。我觉得可能出了什么问题,即使训练完全失败,也不应该每次都输出几乎完全相同的东西。