import numpy as np
import torch
import torch.nn as nn

class Net(torch.nn.Module):
def __init__(self):
super(Net, self).__init__()
self.bn1 = torch.nn.BatchNorm2d(num_features=3)
self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
self.act1 = torch.nn.ReLU()
self.pool1 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
# self.dr1 = torch.nn.Dropout2d(0.1)
self.bn2 = torch.nn.BatchNorm2d(num_features=16)
self.conv2 = torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
self.act2 = torch.nn.ReLU()
self.pool2 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
# self.dr2 = torch.nn.Dropout2d(0.1)
self.bn3 = torch.nn.BatchNorm2d(num_features=32)
self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
self.act3 = torch.nn.ReLU()
self.pool3 = torch.nn.MaxPool2d(kernel_size=2, stride=2)
# self.dr3 = torch.nn.Dropout2d(0.1)
self.bn4 = torch.nn.BatchNorm1d(num_features=4 * 4 * 64)
self.fc4 = torch.nn.Linear(4 * 4 * 64, 256)
self.act4 = torch.nn.Tanh()
# self.dr4 = torch.nn.Dropout1d(0.1)
self.bn5 = torch.nn.BatchNorm1d(num_features=256)
self.fc5 = torch.nn.Linear(256, 64)
self.act5 = torch.nn.Tanh()
# self.dr5 = torch.nn.Dropout1d(0.1)
self.fc6 = torch.nn.Linear(64, 10)
def forward(self, x):
x = self.bn1(x)
x = self.conv1(x)
x = self.act1(x)
x = self.pool1(x)
# x = self.dr1(x)
x = self.bn2(x)
x = self.conv2(x)
x = self.act2(x)
x = self.pool2(x)
# x = self.dr2(x)
x = self.bn3(x)
x = self.conv3(x)
x = self.act3(x)
x = self.pool3(x)
# x = self.dr3(x)
x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
x = self.bn4(x)
x = self.fc4(x)
x = self.act4(x)
# x = self.dr4(x)
x = self.bn5(x)
x = self.fc5(x)
x = self.act5(x)
# x = self.dr5(x)
x = self.fc6(x)
return x
last_model = Net()
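As a quick sanity check (not part of the original code), the first network can be probed with a random CIFAR10-sized batch; the tensor name x_dummy below is purely illustrative:

x_dummy = torch.randn(8, 3, 32, 32)  # CIFAR10 images are 3x32x32
print(last_model(x_dummy).shape)     # expected: torch.Size([8, 10]), one logit per class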
#%%
def conv_block(in_f, out_f, activation='relu', *args, **kwargs):
activations = nn.ModuleDict([
['tanh', nn.Tanh()],
['relu', nn.ReLU()]
])
return nn.Sequential(
nn.BatchNorm2d(in_f),
nn.Conv2d(in_f, out_f, *args, **kwargs),
activations[activation],
nn.MaxPool2d(kernel_size=2, stride=2),
# nn.Dropout2d(0.1)
)
class MyEncoder(nn.Module):
def __init__(self, enc_sizes, *args, **kwargs):
super().__init__()
        self.conv_blocks = nn.Sequential(*[conv_block(in_f, out_f, kernel_size=3, padding=1, *args, **kwargs)
                                           for in_f, out_f in zip(enc_sizes, enc_sizes[1:])])

    def forward(self, x):
        return self.conv_blocks(x)
def dec_block(in_f, out_f):
return nn.Sequential(
nn.BatchNorm1d(in_f),
nn.Linear(in_f, out_f),
nn.Tanh(),
# nn.Dropout1d(0.1)
)
class MyDecoder(nn.Module):
def __init__(self, dec_sizes, n_classes):
super().__init__()
self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
self.last = nn.Linear(dec_sizes[-1], n_classes)
def forward(self, x):
return self.dec_blocks(x)
class MyNET(nn.Module):
def __init__(self, in_c, enc_sizes, dec_sizes, n_classes, activation='relu'):
super().__init__()
self.enc_sizes = [in_c, *enc_sizes]
l = 32 / (2 ** len(enc_sizes))
# print(enc_sizes[-1] * l * l)
self.dec_sizes = [int(enc_sizes[-1] * l * l), *dec_sizes]
self.encoder = MyEncoder(self.enc_sizes, activation=activation)
self.decoder = MyDecoder(self.dec_sizes, n_classes)
def forward(self, x):
x = self.encoder(x)
x = x.view(x.size(0), x.size(1) * x.size(2) * x.size(3))
x = self.decoder(x)
return x
my_model = MyNET(3, [16, 32, 64], [256, 64], 10, activation='relu')
And the results on CIFAR10 after 5 epochs:

The train function:
def train(net, X_train, y_train, X_test, y_test):
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = net.to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1.0e-3, weight_decay=1e-5)
batch_size = 100
test_accuracy_history = []
test_loss_history = []
X_test = X_test.to(device)
y_test = y_test.to(device)
for epoch in range(5):
order = np.random.permutation(len(X_train))
for start_index in range(0, len(X_train), batch_size):
optimizer.zero_grad()
net.train()
batch_indexes = order[start_index:start_index+batch_size]
X_batch = X_train[batch_indexes].to(device)
y_batch = y_train[batch_indexes].to(device).view(-1)
preds = net.forward(X_batch)
loss_value = loss(preds, y_batch)
loss_value.backward()
optimizer.step()
net.eval()
test_preds = net.forward(X_test)
test_loss_history.append(loss(test_preds, y_test.squeeze()).data.cpu())
        accuracy = (test_preds.argmax(dim=1) == y_test.squeeze()).float().mean().data.cpu()
test_accuracy_history.append(accuracy)
print(accuracy)
print('---------------')
return test_accuracy_history, test_loss_history
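Assuming X_train, y_train, X_test and y_test have already been prepared as tensors (CIFAR10 images of shape (N, 3, 32, 32) and integer class labels), both models are trained with the same call; the variable names below are illustrative:

test_acc_1, test_loss_1 = train(last_model, X_train, y_train, X_test, y_test)
test_acc_2, test_loss_2 = train(my_model, X_train, y_train, X_test, y_test)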
I expected these to be the same neural network and to produce the same results. I thought the problem was in the training itself, but if you train the second model first and then the first one, the results stay the same. In the code I deliberately disabled dropout so that it cannot randomly switch off neurons (even though the random seed is the same). Could the problem be that the gradients are somehow computed differently from the usual way?
The forward method of your MyDecoder module skips the final linear layer. As written, the first model produces an output of size (bs, 10), while the second model produces an output of size (bs, 64).
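One way to fix it is to apply self.last inside MyDecoder.forward; a minimal sketch of the corrected class, with dec_block as defined above:

class MyDecoder(nn.Module):
    def __init__(self, dec_sizes, n_classes):
        super().__init__()
        self.dec_blocks = nn.Sequential(*[dec_block(in_f, out_f)
                                          for in_f, out_f in zip(dec_sizes, dec_sizes[1:])])
        self.last = nn.Linear(dec_sizes[-1], n_classes)

    def forward(self, x):
        x = self.dec_blocks(x)
        return self.last(x)  # this call was missing, so the output stopped at 64 features

Note that nothing fails at runtime: CrossEntropyLoss accepts (bs, 64) logits with targets in 0-9 without complaint, and the unused self.last layer simply never contributes to the output, which is why the two networks silently give different results.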