我在尝试使用 Theano 训练一个相当标准的 MLP 模型时遇到了一些困难。我的模型代码如下
class Layer(object):
def __init__(self, inputs, n_in, n_out, activation=T.nnet.softmax):
def weights(shape):
return np.array(np.random.uniform(size=shape), dtype='float64')
def biases(size):
return np.zeros((size), dtype='float64')
self.W = theano.shared(value=weights((n_in, n_out)), name='weights', borrow=True)
self.b = theano.shared(value=biases(n_out), name='biases', borrow=True)
self.output = activation(T.dot(inputs, self.W) + self.b)
self.pred = T.argmax(self.output, axis=1)
self.params = [self.W, self.b]
class MLP(object):
def __init__(self, inputs, n_in, n_hidden, n_out):
""" for now lets go with one hidden layer"""
self._hidden = Layer(inputs, n_in, n_hidden, activation=T.tanh)
self._output = Layer(self._hidden.output, n_hidden, n_out) # softmax by default
def loss(self, one_hot):
return T.mean(T.sqr(one_hot - self._output.output)
def accuracy(self, y):
return T.mean(T.eq(self._output.pred, y))
def updates(self, loss, rate=0.01):
updates = []
updates.append((self._hidden.W, self._hidden.W - rate * T.grad(cost=loss, wrt=self._hidden.W)))
updates.append((self._hidden.b, self._hidden.b - rate * T.grad(cost=loss, wrt=self._hidden.b)))
updates.append((self._output.W, self._output.W - rate * T.grad(cost=loss, wrt=self._output.W)))
updates.append((self._output.b, self._output.b - rate * T.grad(cost=loss, wrt=self._output.b)))
return updates
然后我尝试像这样训练它
x = T.matrix('x', dtype='float64')
y = T.vector('y', dtype='int32')
# basic logistic model
# model = Layer(x, 784, 10, activation=T.nnet.softmax)
# basic multi-layer perceptron
model = MLP(x, 784, 128, 10)
labels = T.extra_ops.to_one_hot(y, 10)
# loss function
#loss = T.mean(T.sqr(labels - model.output))
loss = model.loss(labels)
# average number of correct predictions over a batch
#accuracy = T.mean(T.eq(model.pred, y))
accuracy = model.accuracy(y)
# updates
#rate = 0.05
#g_W = T.grad(cost=loss, wrt=model.W)
#g_b = T.grad(cost=loss, wrt=model.b)
#updates = [(model.W, model.W - rate * g_W),
# (model.b, model.b - rate * g_b)]
updates = model.updates(loss, rate=0.3)
# batch index
index = T.scalar('batch index', dtype='int32')
size = T.scalar('batch size', dtype='int32')
train = theano.function([index, size],
[loss, accuracy],
updates=updates,
givens={x: train_set[0][index * size: (index + 1) * size],
y: train_set[1][index * size: (index + 1) * size]})
valid = theano.function([index, size],
[loss, accuracy],
givens={x: valid_set[0][index * size: (index + 1) * size],
y: valid_set[1][index * size: (index + 1) * size]})
test = theano.function([index, size],
[accuracy],
givens={x: test_set[0][index * size: (index + 1) * size],
y: test_set[1][index * size: (index + 1) * size]})
n_epochs = 10
batch_size = 500
# number of items in training dataset / batch size
batches_in_epoch = datasets[0][0].shape[0] // batch_size
losses = np.empty(0)
errors = np.empty(0)
for epoch in range(1, n_epochs + 1):
epoch_losses = np.empty(0)
epoch_errors = np.empty(0)
for batch_n in range(batches_in_epoch):
l, e = train(batch_n, batch_size)
epoch_losses = np.append(epoch_losses, l)
epoch_errors = np.append(epoch_errors, e)
print('[%s]' % time.ctime(),
'epoch: ', epoch,
'batch: ', batch_n,
'loss: ', np.round(l, 4),
'accuracy: ', np.round(e, 4))
# shuffle train set every epoch
shuffle = np.arange(datasets[0][1].shape[0])
np.random.shuffle(shuffle)
train_set[0] = train_set[0][shuffle]
train_set[1] = train_set[1][shuffle]
losses = np.concatenate([losses, epoch_losses])
errors = np.concatenate([errors, epoch_errors])
valid_l, valid_e = valid(0, datasets[1][0].shape[0])
print('[%s]' % time.ctime(), 'epoch: ', epoch, 'validation loss: ', valid_l, 'validation accuracy: ', valid_e)
acc = test(0, datasets[2][0].shape[0])
print()
print('Final accuracy: ', np.round(acc, 4)[0])
现在,如果你看看评论,我用基本的逻辑回归模型尝试了它,它起作用了,我得到了大约 80% 的准确率。但当我用我的 MLP 模型替换它时,它不起作用。它不会收敛到任何东西,我得到 10% 准确度的随机猜测。我做错了什么?我使用的数据是 MNIST 数据集,按照 Theano 教程的方式加载到共享变量中。
问题似乎出在权重初始化上。您在张量流实现中是如何做到这一点的?
我现在不太确定底层的数学原理,所以如果我错了,请纠正我,但我喜欢将其解释为如果所有权重均为正,则模型无法学习负特征。
您可以尝试将
low=-1, high=1
添加到初始化中(np.random.uniform
默认介于 0 和 1 之间)。
在我的测试中,这需要很长时间才能收敛(~100 epoch),但至少它做到了。
使用更智能的 glorot 初始化,如下所示:
def weights(shape):
return np.random.uniform(low=-np.sqrt(6. / sum(shape)),
high=np.sqrt(6. / sum(shape)),
size=shape)
使训练速度更快。在将其添加到您的代码中 5 个时期后,我获得了大约 90% 的验证准确率。
这也是 theano MLP 示例中初始化权重的方式。