使用梯度下降训练神经网络时出现 NaN 值

Question

我正在尝试从头开始用梯度下降实现一个神经网络。基本上，我有一个二维输入矩阵，其第 i 行是一个输入向量；在用于训练网络的 train 方法中，我会调用神经网络类的三个方法：forward_pass、backward_pass 和 update_weights。

但是，我的问题是每次我尝试在实际数据上执行我的代码时，我最终都会得到垃圾 NaN 值，因为 A_vals 由于某种原因而爆炸（即变得非常大）。我在测试时使用了 ReLU 激活函数，但这并没有解决我的问题。下面是一些相关的代码。

class NeuralNetwork():
    '''
    Fully connected feed-forward network trained with full-batch gradient descent.

    An ``__init__`` method is assumed to exist elsewhere. The methods below read
    these attributes from the instance:

    * ``n_layers``: number of layers.
    * ``W``: list with one weight matrix per layer. Row 0 of each matrix is the
      row multiplied by the constant bias input (a column of ones that is
      prepended to every layer input).
    * ``activations``: list with one object per layer exposing ``value(A)`` and
      ``derivative(A)``.
    * ``loss``: object exposing ``value(y_hat, y)`` and ``derivative(y_hat, y)``.
    * ``learning_rate``: step size used by ``update_weights``.
    '''

    @staticmethod
    def _with_bias_column(M):
        '''Return ``M`` with a leading column of ones (the constant bias input).'''
        bias = np.ones((M.shape[0], 1))
        return np.concatenate((bias, M), axis=1)

    def forward_pass(self, X) -> (List[np.ndarray], List[np.ndarray]):
        '''
        Executes the forward pass of the network on a dataset of n examples with f features. Inputs are fed into the
        first layer. Each layer computes Z_i = g(A_i) = g([1, Z_{i-1}] W[i]), where [1, Z_{i-1}] is the previous
        layer's output with a bias column of ones prepended (and Z_{-1} = X).
        :param X: The training set, with size (n, f)
        :return A_vals: a list of a-values (pre-activations) for each example in the dataset. There are n_layers
                        items in the list and each item is an array of size (n, layer_sizes[i])
                Z_vals: a list of z-values (activations) for each example in the dataset. There are n_layers items
                        in the list and each item is an array of size (n, layer_sizes[i])
        '''
        A_vals = []
        Z_vals = []
        layer_input = X
        for layer in range(self.n_layers):
            A = np.matmul(self._with_bias_column(layer_input), self.W[layer])
            Z = self.activations[layer].value(A)
            A_vals.append(A)
            Z_vals.append(Z)
            layer_input = Z  # this layer's output feeds the next layer

        return A_vals, Z_vals

    def backward_pass(self, A_vals, dLdyhat) -> List[np.ndarray]:
        '''
        Executes the backward pass of the network on a dataset of n examples with f features. The delta values are
        computed from the end of the network to the front. No delta is propagated through the bias row of the
        weights, because the bias input is a constant and has no upstream layer.
        :param A_vals: a list of a-values for each example in the dataset. There are n_layers items in the list and
                       each item is an array of size (n, layer_sizes[i])
        :param dLdyhat: The derivative of the loss with respect to the predictions (y_hat), with shape (n, layer_sizes[-1])
        :return deltas: A list of delta values for each layer, ordered from the first layer to the last. There are
                        n_layers items in the list and each item is an array of size (n, layer_sizes[i])
        '''
        output_deriv = self.activations[self.n_layers - 1].derivative(A_vals[-1])
        # Built back-to-front; deltas[-1] is always the delta of the layer after the current one.
        deltas = [dLdyhat * output_deriv]
        for layer in range(self.n_layers - 2, -1, -1):
            local_deriv = self.activations[layer].derivative(A_vals[layer])
            # [1:] drops the bias row so the shapes line up with this layer's units.
            propagated = np.dot(deltas[-1], self.W[layer + 1][1:].T)
            deltas.append(local_deriv * propagated)

        return deltas[::-1]

    def update_weights(self, X, Z_vals, deltas) -> List[np.ndarray]:
        '''
        Having computed the delta values from the backward pass, update each weight with the sum over the training
        examples of the gradient of the loss with respect to the weight. The weight matrices in self.W are modified
        in place.
        :param X: The training set, with size (n, f)
        :param Z_vals: a list of z-values for each example in the dataset. There are n_layers items in the list and
                       each item is an array of size (n, layer_sizes[i])
        :param deltas: A list of delta values for each layer. There are n_layers items in the list and
                       each item is an array of size (n, layer_sizes[i])
        :return W: The newly updated weights (i.e. self.W)
        '''
        # NOTE: the matmul below sums the per-example gradients (it does not average them), so the
        # effective step size grows with the number of examples n. With a large n or unscaled
        # features, a learning rate that looks small can still make the weights overflow.
        for layer in range(self.n_layers):
            layer_input = X if layer == 0 else Z_vals[layer - 1]
            # The bias column must be ones for every layer, exactly as in forward_pass;
            # a column of zeros here would leave the bias row of W[layer] untrained.
            gradient = np.matmul(self._with_bias_column(layer_input).T, deltas[layer])
            self.W[layer] -= self.learning_rate * gradient
        return self.W

    def train(self, X: np.ndarray, y: np.ndarray, epochs: int) -> (List[np.ndarray], List[float]):
        '''
        Trains the neural network model on a labelled dataset. Every epoch runs one forward pass, one backward pass
        and one weight update over the whole dataset, printing the intermediate values for debugging.
        :param X: The training set, with size (n, f)
        :param y: The targets for each example, with size (n, 1)
        :param epochs: The number of epochs to train the model
        :return W: The trained weights
                epoch_losses: A list of the training losses in each epoch
        '''
        epoch_losses = []
        for epoch in range(epochs):
            A_vals, Z_vals = self.forward_pass(X)   # Execute forward pass
            print("after calling forward pass:")
            print("A_vals:")
            print(A_vals)
            print("Z_vals:")
            print(Z_vals)
            y_hat = Z_vals[-1]                      # Predictions are the last layer's activations
            L = self.loss.value(y_hat, y)           # Compute the loss
            print("Epoch {}/{}: Loss={}".format(epoch, epochs, L))
            epoch_losses.append(L)                  # Keep track of the loss for each epoch

            dLdyhat = self.loss.derivative(y_hat, y)         # Derivative of the loss with respect to the output
            deltas = self.backward_pass(A_vals, dLdyhat)     # Execute the backward pass to compute the deltas
            print("deltas after the backward pass:")
            print(deltas)
            self.W = self.update_weights(X, Z_vals, deltas)  # Calculate the gradients and update the weights
            print("weights after the update:")
            print(self.W)
        return self.W, epoch_losses

使用梯度下降训练神经网络时出现 NaN 值

问题描述投票：0回答：0

最新问题

使用梯度下降训练神经网络时出现 NaN 值

问题描述 投票：0回答：0

最新问题

问题描述投票：0回答：0