我正在使用卷积自动编码器重建形状为
(1, 100, 592)
的频谱图,但它返回的输出形状为(1, 90, 586)
我的编码器是:
class Encoder(nn.Module):
def __init__(self, latent_dim):
super(Encoder, self).__init__()
# define convolutional layers for encoding
#! in_channels is 1 because the spectogram is only 1 channel
self.conv1 = nn.Conv2d(in_channels=1, out_channels=512, kernel_size=(5, 5), stride=(2, 2))
self.batch1 = nn.BatchNorm2d(512)
self.conv2 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=(3, 3), stride=(2, 2))
self.batch2 = nn.BatchNorm2d(256)
self.conv3 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=(3, 3), stride=(2, 2))
self.batch3 = nn.BatchNorm2d(128)
self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=(2, 2), stride=(2, 2))
self.batch4 = nn.BatchNorm2d(64)
self.conv5 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=(1, 1), stride=(1, 1))
# define latent space
self.z = nn.Linear(out_size_len , latent_dim)
def forward(self, x):
x = x.to(device) # move input to GPU
x = F.relu(self.batch1(self.conv1(x))) # apply convolutional layer, batch norm, and ReLU activation
x = F.relu(self.batch2(self.conv2(x)))
x = F.relu(self.batch3(self.conv3(x)))
x = F.relu(self.batch4(self.conv4(x)))
x = F.relu(self.conv5(x))
x = torch.flatten(x, start_dim=1) # flatten output for linear layers
z = F.relu(self.z(x))
return z
# ! This is the exercise below
def get_output_size(self, input_shape):
x = torch.zeros(input_shape)
x = x.to(device)
x = self.conv1(x)
x = self.batch1(x)
x = self.conv2(x)
x = self.batch2(x)
x = self.conv3(x)
x = self.batch3(x)
x = self.conv4(x)
x = self.batch4(x)
x = self.conv5(x)
output_size = x.size()[1:]
return output_size
解码器是:
class Decoder(nn.Module):
def __init__(self, latent_dims):
super().__init__()
# linear layer to map latent code to 3D tensor
self.decoder_lin = nn.Sequential(
nn.Linear(latent_dims, 128),
nn.ReLU(True),
#! The next is the size obtained by the get_output_size
nn.Linear(128, out_size_len),
nn.ReLU(True)
)
# unflatten 3D tensor to 4D tensor
self.unflatten = nn.Unflatten(dim=1, unflattened_size=(int(out_size[0]), int(out_size[1]), int(out_size[2])))
# transposed convolutional layers to gradually increase spatial resolution
self.dec1 = nn.ConvTranspose2d(in_channels=32, out_channels=64, kernel_size=(1, 1), stride=(1, 1))
self.batch1 = nn.BatchNorm2d(64)
self.dec2 = nn.ConvTranspose2d(in_channels=64, out_channels=128, kernel_size=(2, 2), stride=(2, 2))
self.batch2 = nn.BatchNorm2d(128)
self.dec3 = nn.ConvTranspose2d(in_channels=128, out_channels=256, kernel_size=(3, 3), stride=(2, 2))
self.batch3 = nn.BatchNorm2d(256)
self.dec4 = nn.ConvTranspose2d(in_channels=256, out_channels=512, kernel_size=(3, 3), stride=(2, 2), output_padding=1)
self.batch4 = nn.BatchNorm2d(512)
self.dec5 = nn.ConvTranspose2d(in_channels=512, out_channels=1, kernel_size=(3, 3), stride=(2, 2), output_padding=1)
def forward(self, x):
# linear layer to map latent code to 3D tensor
x = self.decoder_lin(x)
# unflatten 3D tensor to 4D tensor
x = self.unflatten(x)
# transposed convolutional layers to gradually increase spatial resolution
x = F.relu(self.batch1(self.dec1(x)))
x = F.relu(self.batch2(self.dec2(x)))
x = F.relu(self.batch3(self.dec3(x)))
x = F.relu(self.batch4(self.dec4(x)))
x = self.dec5(x)
# apply sigmoid activation function to ensure output is between 0 and 1
x = torch.sigmoid(x)
return x
编码器-解码器:
class Enc_Dec(nn.Module):
def __init__(self, latent_dims):
super(Enc_Dec, self).__init__()
self.encoder = Encoder(latent_dims)
self.decoder = Decoder(latent_dims)
def forward(self, x):
x = x.to(device)
z = self.encoder(x)
return self.decoder(z)
所以,为了调查,我使用了 torchsummary,输出如下:
from torchsummary import summary
summary(Enc_Dec(latent_dims=20), X[0].shape)
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 512, 48, 294] 13,312
BatchNorm2d-2 [-1, 512, 48, 294] 1,024
Conv2d-3 [-1, 256, 23, 146] 1,179,904
BatchNorm2d-4 [-1, 256, 23, 146] 512
Conv2d-5 [-1, 128, 11, 72] 295,040
BatchNorm2d-6 [-1, 128, 11, 72] 256
Conv2d-7 [-1, 64, 5, 36] 32,832
BatchNorm2d-8 [-1, 64, 5, 36] 128
Conv2d-9 [-1, 32, 5, 36] 2,080
Linear-10 [-1, 20] 115,220
Encoder-11 [-1, 20] 0
Linear-12 [-1, 128] 2,688
ReLU-13 [-1, 128] 0
Linear-14 [-1, 5760] 743,040
ReLU-15 [-1, 5760] 0
Unflatten-16 [-1, 32, 5, 36] 0
ConvTranspose2d-17 [-1, 64, 5, 36] 2,112
BatchNorm2d-18 [-1, 64, 5, 36] 128
ConvTranspose2d-19 [-1, 128, 10, 72] 32,896
BatchNorm2d-20 [-1, 128, 10, 72] 256
ConvTranspose2d-21 [-1, 256, 21, 145] 295,168
BatchNorm2d-22 [-1, 256, 21, 145] 512
ConvTranspose2d-23 [-1, 512, 44, 292] 1,180,160
BatchNorm2d-24 [-1, 512, 44, 292] 1,024
ConvTranspose2d-25 [-1, 1, 90, 586] 4,609
Decoder-26 [-1, 1, 90, 586] 0
================================================================
Total params: 3,902,901
Trainable params: 3,902,901
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.23
Forward/backward pass size (MB): 239.92
Params size (MB): 14.89
Estimated Total Size (MB): 255.04
----------------------------------------------------------------
我看到差异从
ConvTranspose2d
的第二个 Decoder
开始。因此,我开始切换步幅和填充的一些值,但我没有达到所需的输出形状。
有谁知道我该如何解决这个问题?