I am trying to build a Conv-LSTM network with PyTorch. The model is very similar to an image-caption generator: it learns to predict words well during training, but during inference (when the sample method is called) it does not predict anything sensible. Can you tell me where I am going wrong?
Here is the code for the LSTM network:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LSTM, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.model = nn.LSTM(self.embed_size, self.hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.ln = nn.Linear(self.hidden_size, self.vocab_size)
        self.conv = Conv()

    def forward(self, features, sentence):
        # (batch, embed_size) visual features from the Conv front-end
        features = self.conv(features)
        # (batch, seq_len, embed_size) embeddings of the ground-truth sentence
        embedding = self.embeddings(sentence)
        # prepend the features as the first timestep: (batch, seq_len + 1, embed_size)
        embedding = torch.cat((features.unsqueeze(1), embedding), dim=1)
        embedding = self.dropout(embedding)
        embedding = embedding.cuda()
        # print(embedding.shape)
        out, _ = self.model(embedding)
        # print("...", out.shape)
        lipRead = self.ln(out)
        return lipRead

    def sample(self, x, max_length):
        words = []
        with torch.no_grad():
            features = self.conv(x)
            print("df", features.shape)  # batch, 10
            state = None
            for n in range(36):
                out, state = self.model(features.unsqueeze(1), state)
                # print(out.shape)  # batch_size, 1, 10
                out = self.ln(out)
                # print(out)
                # print(out.shape)  # batch, 1, 40
                pred = out.squeeze(1).argmax(1)
                # print("predicted", pred)
                # print("pred shape:", pred.shape)  # batch
                words.append(pred[2])  # keep only the prediction for batch element 2
                features = self.embeddings(pred).to(device)
                # print(features.shape)  # batch, 10
        return words
I suspect that self.ln or self.model is failing to produce the correct output, but if that were the case, why does the model learn to predict the correct words during training?
This is how I run inference; as you can see, the predicted values make no sense.
for j, (x, y) in enumerate(val_data):
    x = x.type(torch.cuda.FloatTensor)
    x = x.to(device)
    y = torch.from_numpy(y)
    y = y.to(device)
    print("real value: ", y[2])
    words = model.sample(x, 36)
    print("predicted: ", words)
Output:
real value: tensor([ 1, 8, 13, 39, 22, 7, 8, 19, 4, 39, 1, 24, 39, 6, 39, 13, 8, 13,
4, 39, 18, 14, 14, 13, 39, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38],
device='cuda:0')
df torch.Size([16, 10])
predicted: [tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0'), tensor(39, device='cuda:0')]
You can look at this notebook to reproduce the issue.
During training and testing you are forwarding the model with two different inputs. During training, the input is the concatenation of the conv features and the embeddings of the ground-truth sentence. The model is trained on that layout, and it expects the same kind of input at test time.
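A minimal sketch of one way to keep the two consistent, assuming greedy decoding, a hypothetical start-token index, and that forward() can be reused unchanged for decoding (none of these names come from the original notebook):

# Sketch, not the author's code. start_idx is a hypothetical start-token id;
# forward() is reused as-is so the test-time input layout (conv features
# concatenated with token embeddings) matches what the model saw in training.
def sample(self, x, max_length=36, start_idx=0):
    self.eval()  # make sure dropout is disabled at inference time
    with torch.no_grad():
        batch = x.size(0)
        # tokens decoded so far, seeded with the hypothetical start token
        tokens = torch.full((batch, 1), start_idx, dtype=torch.long, device=x.device)
        for _ in range(max_length):
            logits = self.forward(x, tokens)           # same code path as training
            next_tok = logits[:, -1, :].argmax(dim=1)  # greedy pick for the last position
            tokens = torch.cat([tokens, next_tok.unsqueeze(1)], dim=1)
    return tokens[:, 1:]  # drop the start token

This version re-runs the whole prefix through forward() at every step, which is slow but guarantees the input matches the training layout. A faster variant would carry the LSTM state between steps, as your current sample() does, provided the first timestep and the fed-back embeddings follow the same layout the model was trained on.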