我正在尝试借助此存储库(https://github.com/nicklashansen/rnn_lstm_from_scratch/tree/master)从头开始构建 RNN,但每个时期后的训练损失保持不变。训练循环的代码如下:
# Hyper-parameters
num_epochs = 1000
# Initialize a new network
params = init_rnn(hidden_size=hidden_size, vocab_size=vocab_size)
# Initialize hidden state as zeros
hidden_state = np.zeros((hidden_size, 1))
# Track loss
training_loss, validation_loss = [], []
def check_if_params_updated(old_params, new_params):
# This function checks if two sets of parameters are different
for old_param, new_param in zip(old_params, new_params):
if not np.array_equal(old_param, new_param):
return True # Parameters have been updated
return False # Parameters have not been updated
# For each epoch
for i in range(num_epochs):
# Track loss
epoch_training_loss = 0
epoch_validation_loss = 0
# For each sentence in validation set
for inputs, targets in val_loader:
# One-hot encode input and target sequence
inputs_one_hot = one_hot_encode_sequence(inputs, vocab_size)
targets_one_hot = one_hot_encode_sequence(targets, vocab_size)
# Re-initialize hidden state
hidden_state = np.zeros_like(hidden_state)
# Forward pass
outputs, hidden_states = forward_pass(inputs_one_hot, hidden_state, params)
# Backward pass
loss, _ = backward_pass(inputs_one_hot, outputs, hidden_states, targets_one_hot, params)
# Update loss
epoch_validation_loss += loss
# For each sentence in training set
for inputs, targets in train_loader:
# One-hot encode input and target sequence
inputs_one_hot = one_hot_encode_sequence(inputs, vocab_size)
targets_one_hot = one_hot_encode_sequence(targets, vocab_size)
# Re-initialize hidden state
hidden_state = np.zeros_like(hidden_state)
# Forward pass
outputs, hidden_states = forward_pass(inputs_one_hot, hidden_state, params)
# Backward pass
loss, grads = backward_pass(inputs_one_hot, outputs, hidden_states, targets_one_hot, params)
print(inputs_one_hot.shape)
if np.isnan(loss):
raise ValueError('Gradients have vanished/exploded!')
# Update parameters
params = update_parameters(params, grads, lr=1e-3)
# Update loss
epoch_training_loss += loss
# Save loss for plot
training_loss.append(epoch_training_loss/len(training_set))
validation_loss.append(epoch_validation_loss/len(validation_set))
# Print loss every 100 epochs
if i % 100 == 0:
print(f'Epoch {i}, training loss: {training_loss[-1]}, validation loss: {validation_loss[-1]}')
# Get first sentence in test set
inputs, targets = test_set[1]
# One-hot encode input and target sequence
inputs_one_hot = one_hot_encode_sequence(inputs, vocab_size)
targets_one_hot = one_hot_encode_sequence(targets, vocab_size)
# Initialize hidden state as zeros
hidden_state = np.zeros((hidden_size, 1))
# Forward pass
outputs, hidden_states = forward_pass(inputs_one_hot, hidden_state, params)
output_sentence = [idx_to_word[np.argmax(output)] for output in outputs]
print('Input sentence:')
print(inputs)
print('\nTarget sequence:')
print(targets)
print('\nPredicted sequence:')
print([idx_to_word[np.argmax(output)] for output in outputs])
# Plot training and validation loss
epoch = np.arange(len(training_loss))
plt.figure()
plt.plot(epoch, training_loss, 'r', label='Training loss',)
plt.plot(epoch, validation_loss, 'b', label='Validation loss')
plt.legend()
plt.xlabel('Epoch'), plt.ylabel('NLL')
plt.show()
我尝试检查我的参数是否正在更新,它们确实更新了,还尝试检查梯度,它们并不是指数小。每次迭代后损失都会减少,但总纪元的损失保持不变。您可以在存储库中找到完整的代码,其中包括前向和后向传递(https://github.com/dangerdude237/RNN_From_Scratch)。
我认为您将总损失除以验证集中的批次数,但您没有对训练集执行此操作。您应该相应地更新代码:
epoch_training_loss += loss training_loss.append(epoch_training_loss / len(train_loader)) validation_loss.append(epoch_validation_loss / len(val_loader))
如果这不起作用,那么隐藏阶段初始化可能存在问题,您应该在每个纪元开始时将隐藏阶段初始化为零。 hidden_state = np.zeros((hidden_size, 1))