My neural network only works for the first iteration, then the cost function returns NaN


I have a neural network that works on a dataset I found in a tutorial. I have spent the past 48 hours on the code and it runs fine there. But now I want to test it on another dataset, and it fails: it only gets through the first iteration, returns a high cost, and then the cost becomes NaN.

from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

def sigmoid(x): return 1 / (1 + np.exp(-x))

def softmax(x): return np.exp(x)/np.sum(np.exp(x))

def tanh(x): return np.tanh(x)

def relu(x): return np.maximum(x, 0)

def derivative_tanh(x): return 1 - np.power(np.tanh(x), 2)

def derivative_relu(x): return np.array(x>0, dtype=np.float32)

def initialize_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

    return parameters

def forward_propagation(X, parameters, activation):
    forward_cache = {}
    L = len(parameters) // 2

    forward_cache['A0'] = X

    for l in range(1, L):
        forward_cache['Z' + str(l)] = parameters['W' + str(l)].dot(forward_cache['A' + str(l-1)]) + parameters['b' + str(l)]

        if activation == 'tanh':
            forward_cache['A' + str(l)] = tanh(forward_cache['Z' + str(l)])
        else:
            forward_cache['A' + str(l)] = relu(forward_cache['Z' + str(l)])


    forward_cache['Z' + str(L)] = parameters['W' + str(L)].dot(forward_cache['A' + str(L-1)]) + parameters['b' + str(L)]

    if forward_cache['Z' + str(L)].shape[0] == 1:
        forward_cache['A' + str(L)] = sigmoid(forward_cache['Z' + str(L)])
    else :
        forward_cache['A' + str(L)] = softmax(forward_cache['Z' + str(L)])

    return forward_cache['A' + str(L)], forward_cache

def compute_cost(AL, Y):
    m = Y.shape[0]

    if size_of_output == 1:
        cost = (1./m) * (-np.dot(Y,np.log(AL).T) - np.dot(1-Y, np.log(1-AL).T))
    else:
        cost = -(1./m) * np.sum(Y * np.log(AL))

    cost = np.squeeze(cost)

    return cost

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_propagation(AL, Y, parameters, forward_cache, activation):
    grads = {}
    L = len(parameters)//2
    m = AL.shape[1]

    grads["dZ" + str(L)] = AL - Y
    grads["dW" + str(L)] = 1./m * np.dot(grads["dZ" + str(L)],forward_cache['A' + str(L-1)].T)
    grads["db" + str(L)] = 1./m * np.sum(grads["dZ" + str(L)], axis = 1, keepdims = True)

    for l in reversed(range(1, L)):
        if activation == 'tanh':
            grads["dZ" + str(l)] = np.dot(parameters['W' + str(l+1)].T,grads["dZ" + str(l+1)])*derivative_tanh(forward_cache['A' + str(l)])
        else:
            grads["dZ" + str(l)] = np.dot(parameters['W' + str(l+1)].T,grads["dZ" + str(l+1)])*derivative_relu(forward_cache['A' + str(l)])

        grads["dW" + str(l)] = 1./m * np.dot(grads["dZ" + str(l)],forward_cache['A' + str(l-1)].T)
        grads["db" + str(l)] = 1./m * np.sum(grads["dZ" + str(l)], axis = 1, keepdims = True)

    return grads

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2

    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]

    return parameters

def predict(X, y, parameters, activation):
    m = X.shape[1]
    y_pred, caches = forward_propagation(X, parameters, activation)

    if size_of_output == 1:
        y_pred = np.array(y_pred > 0.5, dtype = 'float')
    else:
        y = np.argmax(y, 0)
        y_pred = np.argmax(y_pred, 0)

    return np.round(np.sum((y_pred == y)/m), 2)

def model(X, Y, layers_dims, learning_rate = 0.03, activation = 'relu', num_iterations = 3000):#lr was 0.009

    np.random.seed(1)
    costs = []

    parameters = initialize_parameters(layers_dims)

    for i in range(0, num_iterations):
        AL, forward_cache = forward_propagation(X, parameters, activation)
        cost = compute_cost(AL, Y)
        grads = backward_propagation(AL, Y, parameters, forward_cache, activation)
        parameters = update_parameters(parameters, grads, learning_rate)

        if i % (num_iterations/10) == 0:
            print("\niter:{} \t cost: {} \t train_acc:{} \t test_acc:{}".format(i, np.round(cost, 2), predict(X_train, Y_train, parameters, activation), predict(X_test, Y_test, parameters, activation)))

        if i % 10 == 0:
            print("==", end = '')


    return parameters


numbers = datasets.load_digits()
X, Y = numbers.data, numbers.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)

X_train = X_train.T
X_test = X_test.T

size_of_output = one_hot(Y_train).shape[0]


print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
# (64, 1437)
# (1437,)
# (64, 360)
# (360,)

layer_dims = [X_train.shape[0], 20, 7, 5, size_of_output] # 20,7,5 are random hidden layers

parameters = model(X_train, Y_train, layer_dims, learning_rate = 0.0075, activation = 'relu', num_iterations = 2500)

I have tried transposing the matrices, changing Y with one_hot, searching Google, and looking at other tutorials and other neural networks; I even tried ChatGPT, but nothing has worked.

Tags: numpy, scikit-learn, neural-network, dataset, artificial-intelligence
1 Answer

Check whether you are doing the proper preprocessing (e.g., standardization, ...).
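As a minimal sketch of what that could look like with the X_train/X_test layout used above (features in rows, samples in columns), you might standardize each pixel using statistics from the training set only; the small epsilon is just an assumption to guard against zero-variance pixels in load_digits:

import numpy as np

# Per-feature mean and standard deviation, computed on the training set only.
mu = X_train.mean(axis=1, keepdims=True)
sigma = X_train.std(axis=1, keepdims=True) + 1e-8  # avoid division by zero for constant pixels

# Apply the same transform to both splits.
X_train = (X_train - mu) / sigma
X_test = (X_test - mu) / sigma

Since load_digits pixel values range from 0 to 16, an even simpler alternative is to divide X by 16.0 before calling train_test_split, which keeps the inputs small enough that the forward pass is less likely to overflow.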
