我正在尝试从头开始用梯度下降实现一个神经网络。输入是一个二维矩阵,其第 i 行是第 i 个样本的输入向量;用于训练网络的 train 方法会依次调用神经网络类的三个方法:forward_pass、backward_pass 和 update_weights。
但是,每次在真实数据上运行这段代码时,A_vals 都会由于某种原因爆炸(即数值变得非常大),最终得到毫无意义的 NaN 值。我在测试时使用了 ReLU 激活函数,但这并没有解决问题。下面是相关代码。
class NeuralNetwork():
    '''
    Fully connected feed-forward network trained with full-batch gradient descent.

    The methods below read the following attributes, which are expected to be set by the
    constructor (not shown here):
        n_layers:      number of layers
        W:             list of weight matrices, one per layer; row 0 of each matrix holds the
                       bias weights (every layer input gets a leading column of ones)
        activations:   list of objects exposing value(A) and derivative(A), one per layer
        loss:          object exposing value(y_hat, y) and derivative(y_hat, y)
        learning_rate: step size used by update_weights
    '''
    # assume an init method is present

    @staticmethod
    def _prepend_bias_column(M: np.ndarray) -> np.ndarray:
        '''
        Returns a copy of the 2D array M with a leading column of ones (the bias input).
        :param M: array of size (n, k)
        :return: array of size (n, k + 1) whose first column is all ones
        '''
        ones = np.ones((M.shape[0], 1))
        return np.concatenate((ones, M), axis=1)

    def forward_pass(self, X) -> (List[np.ndarray], List[np.ndarray]):
        '''
        Executes the forward pass of the network on a dataset of n examples with f features. Inputs are fed into the
        first layer. Each layer computes A_i = [1, Z_{i-1}] W[i] and Z_i = g_i(A_i), where Z_{-1} = X and [1, .]
        denotes prepending a column of ones for the bias.
        :param X: The training set, with size (n, f)
        :return A_vals: a list of a-values for each example in the dataset. There are n_layers items in the list and
                        each item is an array of size (n, layer_sizes[i])
                Z_vals: a list of z-values for each example in the dataset. There are n_layers items in the list and
                        each item is an array of size (n, layer_sizes[i])
        '''
        A_vals = []
        Z_vals = []
        layer_input = X
        for layer in range(self.n_layers):
            A = np.matmul(self._prepend_bias_column(layer_input), self.W[layer])
            Z = self.activations[layer].value(A)
            A_vals.append(A)
            Z_vals.append(Z)
            # The activated output of this layer is the input of the next one.
            layer_input = Z
        return A_vals, Z_vals

    # we assume we do not compute delta values for the biases of the weights
    def backward_pass(self, A_vals, dLdyhat) -> List[np.ndarray]:
        '''
        Executes the backward pass of the network on a dataset of n examples with f features. The delta values are
        computed from the end of the network to the front.
        :param A_vals: a list of a-values for each example in the dataset. There are n_layers items in the list and
                       each item is an array of size (n, layer_sizes[i])
        :param dLdyhat: The derivative of the loss with respect to the predictions (y_hat), with shape (n, layer_sizes[-1])
        :return deltas: A list of delta values for each layer. There are n_layers items in the list and
                        each item is an array of size (n, layer_sizes[i])
        '''
        # Output layer: chain the loss derivative through the last activation.
        output_deriv = self.activations[self.n_layers - 1].derivative(A_vals[-1])
        deltas = [dLdyhat * output_deriv]
        for layer in range(self.n_layers - 2, -1, -1):
            # Row 0 of W[layer + 1] is the bias row; it does not connect to any unit of this
            # layer, so it is dropped before propagating the deltas backwards.
            propagated = np.dot(deltas[-1], self.W[layer + 1][1:].T)
            deltas.append(self.activations[layer].derivative(A_vals[layer]) * propagated)
        # Deltas were collected from the last layer to the first; return them in layer order.
        return deltas[::-1]

    def update_weights(self, X, Z_vals, deltas) -> List[np.ndarray]:
        '''
        Having computed the delta values from the backward pass, update each weight with the sum over the training
        examples of the gradient of the loss with respect to the weight.
        :param X: The training set, with size (n, f)
        :param Z_vals: a list of z-values for each example in the dataset. There are n_layers items in the list and
                       each item is an array of size (n, layer_sizes[i])
        :param deltas: A list of delta values for each layer. There are n_layers items in the list and
                       each item is an array of size (n, layer_sizes[i])
        :return W: The newly updated weights (i.e. self.W)
        '''
        # NOTE(review): the gradient below is a sum over the n examples, not a mean, so the
        # effective step size grows with n unless self.loss.derivative already normalises by n
        # (confirm). A step that is too large is a likely cause of A_vals blowing up.
        #
        # The first layer is fed X with a bias column of ONES, exactly as in forward_pass.
        # (A column of zeros here would give the first-layer bias a zero gradient forever.)
        self.W[0] -= self.learning_rate * np.matmul(self._prepend_bias_column(X).T, deltas[0])
        for layer in range(1, self.n_layers):
            # Layer `layer` was fed the activated output of the previous layer.
            layer_input = self._prepend_bias_column(Z_vals[layer - 1])
            self.W[layer] -= self.learning_rate * np.matmul(layer_input.T, deltas[layer])
        return self.W

    def train(self, X: np.ndarray, y: np.ndarray, epochs: int) -> (List[np.ndarray], List[float]):
        '''
        Trains the neural network model on a labelled dataset.
        :param X: The training set, with size (n, f)
        :param y: The targets for each example, with size (n, 1)
        :param epochs: The number of epochs to train the model
        :return W: The trained weights
                epoch_losses: A list of the training losses in each epoch
        '''
        epoch_losses = []
        for epoch in range(epochs):
            A_vals, Z_vals = self.forward_pass(X)  # Execute forward pass
            print("after calling forward pass:")
            print("A_vals:")
            print(A_vals)
            print("Z_vals:")
            print(Z_vals)
            y_hat = Z_vals[-1]  # Get predictions
            L = self.loss.value(y_hat, y)  # Compute the loss
            print("Epoch {}/{}: Loss={}".format(epoch, epochs, L))
            epoch_losses.append(L)  # Keep track of the loss for each epoch
            dLdyhat = self.loss.derivative(y_hat, y)  # Calculate derivative of the loss with respect to output
            deltas = self.backward_pass(A_vals, dLdyhat)  # Execute the backward pass to compute the deltas
            print("deltas after the backward pass:")
            print(deltas)
            self.W = self.update_weights(X, Z_vals, deltas)  # Calculate the gradients and update the weights
            print("weights after the update:")
            print(self.W)
        return self.W, epoch_losses