I'm having a hard time understanding image segmentation. I've implemented a UNet model and am trying to train it on the PASCAL VOC dataset. However, I'm stuck when computing the loss: I'm not sure what the expected shapes of the output and the target classes should be. Can someone explain what I'm doing wrong? My only guess is that I'm missing something with the ground-truth images, since I don't see how the model would learn which class is which. Thanks!
Here is my UNet class:
import torch
import torch.nn as nn
from torchvision import transforms


def x2conv(in_channels, out_channels):
    double_conv = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=0),
        nn.ReLU(inplace=True))
    return double_conv


class Encoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.enc_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        ftrs = []
        for block in self.enc_blocks:
            x = block(x)
            ftrs.append(x)
            x = self.pool(x)
        return ftrs


class Decoder(nn.Module):
    def __init__(self, chs):
        super().__init__()
        self.chs = chs
        self.upconvs = nn.ModuleList(
            [nn.ConvTranspose2d(chs[i], chs[i+1], kernel_size=2, stride=2) for i in range(len(chs)-1)])
        self.dec_blocks = nn.ModuleList(
            [x2conv(chs[i], chs[i+1]) for i in range(len(chs)-1)])

    def forward(self, x, encoder_features):
        for i in range(len(self.chs)-1):
            x = self.upconvs[i](x)
            enc_ftrs = self.crop(encoder_features[i], x)
            x = torch.cat([x, enc_ftrs], dim=1)
            x = self.dec_blocks[i](x)
        return x

    def crop(self, enc_ftrs, x):
        _, _, H, W = x.shape
        enc_ftrs = transforms.CenterCrop([H, W])(enc_ftrs)
        return enc_ftrs


class UNet(nn.Module):
    def __init__(self, enc_chs, dec_chs, num_class):
        super(UNet, self).__init__()
        self.encoder = Encoder(enc_chs)
        self.decoder = Decoder(dec_chs)
        self.softmax = nn.Conv2d(dec_chs[-1], num_class, kernel_size=1)

    def forward(self, x):
        enc_ftrs = self.encoder(x)
        out = self.decoder(enc_ftrs[::-1][0], enc_ftrs[::-1][1:])
        out = self.softmax(out)
        return out
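For reference, a minimal forward-pass shape check (the 572x572 input size is an assumption; with the unpadded 3x3 convolutions it shrinks to 388x388 at the output):

model = UNet((3, 64, 128, 256, 512, 1024), (1024, 512, 256, 128, 64), 20)
x = torch.randn(1, 3, 572, 572)  # dummy RGB batch of one
print(model(x).shape)  # torch.Size([1, 20, 388, 388])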
Here is my dataset class:
from PIL import Image
import torchvision
import torchvision.transforms as T  # needed for T.Compose in __getitem__

VOC_CLASSES = [  # How to use?
    "background",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]

VOC_COLORMAP = [  # How to use?
    [0, 0, 0],        # Background
    [128, 0, 0],      # Aeroplane
    [0, 128, 0],      # Bicycle
    [128, 128, 0],    # Bird
    [0, 0, 128],      # Boat
    [128, 0, 128],    # Bottle
    [0, 128, 128],    # Bus
    [128, 128, 128],  # Car
    [64, 0, 0],       # Cat
    [192, 0, 0],      # Chair
    [64, 128, 0],     # Cow
    [192, 128, 0],    # Diningtable
    [64, 0, 128],     # Dog
    [192, 0, 128],    # Horse
    [64, 128, 128],   # Motorbike
    [192, 128, 128],  # Person
    [0, 64, 0],       # Pottedplant
    [128, 64, 0],     # Sheep
    [0, 192, 0],      # Sofa
    [128, 192, 0],    # Train
    [0, 64, 128],     # Tvmonitor
]


class VocDataset(torchvision.datasets.VOCSegmentation):
    def __init__(self, image_set, transform, root="../data/VOCtrainval_11-May-2012/", download=False, year="2012"):
        self.transform = transform
        self.year = year
        super().__init__(root=root, image_set=image_set,
                         download=download, transform=transform, year=year)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        # open images and do transformation  img = jpg, mask = png
        img = Image.open(self.images[index]).convert("RGB")
        target = Image.open(self.masks[index]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        trfm = T.Compose([T.ToTensor(), T.Resize((388, 388))])
        target = trfm(target)
        return img, target
And finally, here is my train function:
import torch
import torch.nn as nn
import torch.optim as optim
from unet import UNet
from torch.utils.data import DataLoader
from dataset import VocDataset
import torchvision.transforms as T
import torch.nn.functional as F

# Hyperparameters etc.
STD = [0.2686, 0.2652, 0.2812]  # Std for dataset
MEAN = [0.4568, 0.4431, 0.4083]  # Mean for dataset
MOMENTUM = 0.9
LEARNING_RATE = 1e-4
BATCH_SIZE = 32
NUM_EPOCHS = 1
NUM_WORKERS = 2
NUM_CLASSES = 20
TRAIN_SET = "train"
VAL_SET = "val"
SIZE = (572, 572)  # not defined in the original post; 572x572 is assumed, since the unpadded convs then yield the 388x388 output noted below
ENC_CHANNELS = (3, 64, 128, 256, 512, 1024)  # Encoder channels
DEC_CHANNELS = (1024, 512, 256, 128, 64)  # Decoder channels
TRANSFORM = T.Compose(
    [T.ToTensor(), T.Resize(SIZE), T.Normalize(MEAN, STD)]
)


def main():
    training_data = VocDataset(TRAIN_SET, TRANSFORM)
    train_dataloader = DataLoader(
        training_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, drop_last=True)

    # Create instance of unet
    unet = UNet(ENC_CHANNELS, DEC_CHANNELS, NUM_CLASSES)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        unet.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

    for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data  # Shape for labels and inputs are: [32,3,388,388]
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = unet(inputs)  # output shape is [32, 32, 388, 388]
            loss = criterion(outputs, labels)  # Error here
            loss.backward()
            optimizer.step()
    # print('Finished Training')


if __name__ == "__main__":
    main()
For starters, your labels and outputs have different dimensions (32 vs. 3 channels). Cross-entropy loss expects either that both have the same number of channels, or that the target has a single channel whose integer values indicate the relevant class.

Let's work with the latter case. Here, given your input and batch size, we need to reduce the target to a single channel, i.e. [32 x 388 x 388]. (Secondly, the UNet should ideally have one output channel per class; it looks like there are 22 classes here, so you should change the final output layer of the UNet decoder to have 22 outputs.)
To convert labels of size [32 x 3 x 388 x 388] into [32 x 388 x 388], you need to use the colormap. That is, create a new target tensor of size [32 x 1 x 388 x 388]; for each value target[i, j, k], assign the index into VOC_COLORMAP that matches the value stored in the pixel label[i, :, j, k].
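A minimal sketch of that lookup (the helper name voc_mask_to_class_index is made up; it assumes the mask arrives as an RGB uint8 tensor of shape [3, H, W], i.e. before ToTensor rescales it to [0, 1]):

import torch

def voc_mask_to_class_index(mask, colormap=VOC_COLORMAP):
    # mask: uint8 RGB tensor [3, H, W] with values in 0..255
    # returns: long tensor [H, W] holding the index into colormap for each pixel
    target = torch.zeros(mask.shape[1], mask.shape[2], dtype=torch.long)
    for idx, rgb in enumerate(colormap):
        color = torch.tensor(rgb, dtype=mask.dtype).view(3, 1, 1)
        target[(mask == color).all(dim=0)] = idx
    return target

# hypothetical usage in VocDataset.__getitem__, before any ToTensor call:
# target = voc_mask_to_class_index(torch.from_numpy(np.array(target)).permute(2, 0, 1))

Two caveats. First, resize the mask with nearest-neighbor interpolation (T.Resize((388, 388), interpolation=T.InterpolationMode.NEAREST)), since bilinear resizing blends the colormap colors into values that match no class. Second, the VOC masks are palette PNGs whose raw pixel values are already the class indices (255 marks the void border), so you can skip the colormap lookup entirely by dropping .convert("RGB") and reading the palette indices directly, e.g. torch.as_tensor(np.array(Image.open(path))).long().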
The error happens when computing the loss. Reading the PyTorch documentation, I think you just need to change the shape of your model output. From the docs for CrossEntropyLoss:
Input: shape (C), (N, C) or (N, C, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss.
Target: if containing class indices, shape (), (N) or (N, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss, where each value should be in [0, C). If containing class probabilities, same shape as the input, with each value in [0, 1].
Output: if reduction is 'none', shape (), (N) or (N, d_1, d_2, ..., d_K) with K >= 1 in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.
where: C = number of classes, N = batch size.
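Concretely for this setup, the logits should be (N, C, H, W) and the target (N, H, W) with long class indices. A minimal sketch with illustrative shapes (note that CrossEntropyLoss applies log-softmax internally, so the final 1x1 conv should emit raw logits):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss(ignore_index=255)  # 255 = VOC void border, excluded from the loss
outputs = torch.randn(32, 21, 388, 388)        # (N, C, d1, d2): raw per-class logits
labels = torch.randint(0, 21, (32, 388, 388))  # (N, d1, d2): long class indices in [0, C)
loss = criterion(outputs, labels)              # scalar with the default reduction='mean'

Passing ignore_index=255 is a common choice for VOC so that the void border pixels don't contribute to the loss.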