I have a neural network that works on a dataset I found in a tutorial. I've been working on the code for the past 48 hours and it runs fine. But now I want to test it on other datasets, and it fails. It only runs the first iteration, returns a high cost, and then the cost becomes nan.
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
def sigmoid(x): return 1 / (1 + np.exp(-x))
def softmax(x): return np.exp(x)/np.sum(np.exp(x))
def tanh(x): return np.tanh(x)
def relu(x): return np.maximum(x, 0)
def derivative_tanh(x): return 1 - np.power(np.tanh(x), 2)
def derivative_relu(x): return np.array(x>0, dtype=np.float32)
def initialize_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims)
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters
def forward_propagation(X, parameters, activation):
    forward_cache = {}
    L = len(parameters) // 2
    forward_cache['A0'] = X
    for l in range(1, L):
        forward_cache['Z' + str(l)] = parameters['W' + str(l)].dot(forward_cache['A' + str(l-1)]) + parameters['b' + str(l)]
        if activation == 'tanh':
            forward_cache['A' + str(l)] = tanh(forward_cache['Z' + str(l)])
        else:
            forward_cache['A' + str(l)] = relu(forward_cache['Z' + str(l)])
    forward_cache['Z' + str(L)] = parameters['W' + str(L)].dot(forward_cache['A' + str(L-1)]) + parameters['b' + str(L)]
    if forward_cache['Z' + str(L)].shape[0] == 1:
        forward_cache['A' + str(L)] = sigmoid(forward_cache['Z' + str(L)])
    else:
        forward_cache['A' + str(L)] = softmax(forward_cache['Z' + str(L)])
    return forward_cache['A' + str(L)], forward_cache
def compute_cost(AL, Y):
    m = Y.shape[0]
    if size_of_output == 1:
        cost = (1./m) * (-np.dot(Y, np.log(AL).T) - np.dot(1-Y, np.log(1-AL).T))
    else:
        cost = -(1./m) * np.sum(Y * np.log(AL))
    cost = np.squeeze(cost)
    return cost
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y
def backward_propagation(AL, Y, parameters, forward_cache, activation):
    grads = {}
    L = len(parameters) // 2
    m = AL.shape[1]
    grads["dZ" + str(L)] = AL - Y
    grads["dW" + str(L)] = 1./m * np.dot(grads["dZ" + str(L)], forward_cache['A' + str(L-1)].T)
    grads["db" + str(L)] = 1./m * np.sum(grads["dZ" + str(L)], axis=1, keepdims=True)
    for l in reversed(range(1, L)):
        if activation == 'tanh':
            grads["dZ" + str(l)] = np.dot(parameters['W' + str(l+1)].T, grads["dZ" + str(l+1)]) * derivative_tanh(forward_cache['A' + str(l)])
        else:
            grads["dZ" + str(l)] = np.dot(parameters['W' + str(l+1)].T, grads["dZ" + str(l+1)]) * derivative_relu(forward_cache['A' + str(l)])
        grads["dW" + str(l)] = 1./m * np.dot(grads["dZ" + str(l)], forward_cache['A' + str(l-1)].T)
        grads["db" + str(l)] = 1./m * np.sum(grads["dZ" + str(l)], axis=1, keepdims=True)
    return grads
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters
def predict(X, y, parameters, activation):
    m = X.shape[1]
    y_pred, caches = forward_propagation(X, parameters, activation)
    if size_of_output == 1:
        y_pred = np.array(y_pred > 0.5, dtype='float')
    else:
        y = np.argmax(y, 0)
        y_pred = np.argmax(y_pred, 0)
    return np.round(np.sum((y_pred == y) / m), 2)
def model(X, Y, layers_dims, learning_rate=0.03, activation='relu', num_iterations=3000):  # lr was 0.009
    np.random.seed(1)
    costs = []
    parameters = initialize_parameters(layers_dims)
    for i in range(0, num_iterations):
        AL, forward_cache = forward_propagation(X, parameters, activation)
        cost = compute_cost(AL, Y)
        grads = backward_propagation(AL, Y, parameters, forward_cache, activation)
        parameters = update_parameters(parameters, grads, learning_rate)
        if i % (num_iterations / 10) == 0:
            print("\niter:{} \t cost: {} \t train_acc:{} \t test_acc:{}".format(i, np.round(cost, 2), predict(X_train, Y_train, parameters, activation), predict(X_test, Y_test, parameters, activation)))
        if i % 10 == 0:
            print("==", end='')
    return parameters
numbers = datasets.load_digits()
X, Y = numbers.data, numbers.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)
X_train = X_train.T
X_test = X_test.T
size_of_output = one_hot(Y_train).shape[0]
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
# (64, 1437)
# (1437,)
# (64, 360)
# (360,)
layer_dims = [X_train.shape[0], 20, 7, 5, size_of_output]  # 20, 7, 5 are arbitrarily chosen hidden layer sizes
parameters = model(X_train, Y_train, layer_dims, learning_rate = 0.0075, activation = 'relu', num_iterations = 2500)
I tried transposing the matrices, changing Y with one_hot, searching Google, looking at other tutorials and other neural networks; I even tried ChatGPT, but nothing worked.
Check whether you did the proper preprocessing (e.g. normalization, ...).
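As a minimal sketch of what that could look like here (an assumption about the cause, not something I verified against your full run): load_digits pixel values range from 0 to 16, so you could scale them before splitting and transposing, for example:

    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    import numpy as np

    numbers = datasets.load_digits()
    X, Y = numbers.data, numbers.target

    # Scale the 0-16 pixel values into [0, 1]; standardizing to zero mean and
    # unit variance would also work. Without some scaling, the unscaled inputs
    # can push the pre-activations into a range where softmax/log overflow
    # and the cost turns into nan.
    X = X / 16.0

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1234)
    X_train = X_train.T
    X_test = X_test.T

If the cost still becomes nan after scaling, also check that the labels you pass into model are one-hot encoded to the same (10, m) shape as the softmax output, since compute_cost multiplies Y elementwise with np.log(AL).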