ModelCheckpoint callback causes a Keras error

Problem description (votes: 10, answers: 5)

I seem to get this error when I use the ModelCheckpoint callback.

I read in a GitHub issue that the solution would be to use model.get_weights, but implicitly I am only storing that anyway, since I only keep the one with the best weights.

Keras only seems to save weights as h5, which makes me wonder: is there another way to store them through the Keras API, and if so, how? If not, how do I store them?

Example to reproduce the issue:

#!/usr/bin/python


import glob, os
import sys
from os import listdir
from os.path import isfile, join
import numpy as np
import warnings
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from keras.utils import np_utils
from keras import metrics
import keras
from keras import backend as K
from keras.models import Sequential
from keras.optimizers import SGD, Adam
from keras.layers.core import Dense, Activation, Lambda, Reshape,Flatten
from keras.layers import Conv1D,Conv2D,MaxPooling2D, MaxPooling1D, Reshape
#from keras.utils.visualize_util import plot
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers.merge import Concatenate, Add
import h5py
import random
import tensorflow as tf
import math
from keras.callbacks import CSVLogger
from keras.callbacks import ModelCheckpoint


if len(sys.argv) < 5:
    print "Missing Arguments!"
    print "python keras_convolutional_feature_extraction.py <workspace> <totale_frames> <fbank-dim> <window-height> <batch_size>"
    print "Example:"
    print "python keras_convolutional_feature_extraction.py deltas 15 40 5 100"
    sys.exit()


total_frames = int(sys.argv[2])
total_frames_with_deltas = total_frames*3
dim = int(sys.argv[3])
window_height = int(sys.argv[4])
inserted_batch_size = int(sys.argv[5])
stride = 1
splits = ((dim - window_height)+1)/stride

#input_train_data = "/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_train_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(inserted_batch_size)+"_fws_input"
#output_train_data ="/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_train_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(inserted_batch_size)+"_fws_output"
#input_test_data = "/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_test_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(1)+"_fws_input"
#output_test_data = "/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_test_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(1)+"_fws_output"

#train_files =[f for f in listdir(input_train_data) if isfile(join(input_train_data, f))]
#test_files =[f for f in listdir(input_test_data) if isfile(join(input_test_data, f))]

#print len(train_files)
np.random.seed(100)
print "hallo"
def train_generator():
    while True:
#        input = random.choice(train_files)
#        h5f = h5py.File(input_train_data+'/'+input, 'r')
#        train_input = h5f['train_input'][:]
#        train_output = h5f['train_output'][:]
#        h5f.close()
        train_input = np.random.randint(100,size=((inserted_batch_size,splits*total_frames_with_deltas,window_height,3)))
        train_list_list = []
        train_input = train_input.reshape((inserted_batch_size,splits*total_frames_with_deltas,window_height,3))
        train_input_list = np.split(train_input,splits*total_frames_with_deltas,axis=1)
        for i in range(len(train_input_list)):
            train_input_list[i] = train_input_list[i].reshape(inserted_batch_size,window_height,3)


        #for i in range(len(train_input_list)):
        #    train_input_list[i] = train_input_list[i].reshape(inserted_batch_size,33,window_height,1,3)

        train_output = np.random.randint(5, size = (1,total_frames,5))
        middle = int(math.ceil(total_frames/2))

        train_output = train_output[:,middle:middle+1,:].reshape((inserted_batch_size,1,5))
        #print train_output.shape
        #print len(train_input_list)
        #print train_input_list[0].shape
        yield (train_input_list, train_output)
print "hallo"
def test_generator():
    while True:
#        input = random.choice(test_files)
#        h5f = h5py.File(input_test_data+'/'+input, 'r')
#        test_input = h5f['test_input'][:]
#        test_output = h5f['test_output'][:]
#        h5f.close()
        test_input = np.random.randint(100,size=((inserted_batch_size,splits*total_frames_with_deltas,window_height,3)))
        test_input = test_input.reshape((inserted_batch_size,splits*total_frames_with_deltas,window_height,3))
        test_input_list = np.split(test_input,splits*total_frames_with_deltas,axis=1)
        #test_input_list = np.split(test_input,45,axis=3)

        for i in range(len(test_input_list)):
            test_input_list[i] = test_input_list[i].reshape(inserted_batch_size,window_height,3)

        #for i in range(len(test_input_list)):
        #    test_input_list[i] = test_input_list[i].reshape(inserted_batch_size,33,window_height,1,3)

        test_output = np.random.randint(5, size = (1,total_frames,5))

        middle = int(math.ceil(total_frames/2))

        test_output = test_output[:,middle:middle+1,:].reshape((inserted_batch_size,1,5))

        yield (test_input_list, test_output)
print "hallo"

def fws():
    #print "Inside"
    #   Params:
    #   batch ,  lr, decay , momentum, epochs
    #
    #Input shape: (batch_size,40,45,3)
    #output shape: (1,15,50)
    # number of unit in conv_feature_map = splitd
    next(train_generator())
    model_output = []
    list_of_input = [Input(shape=(8,3)) for i in range(splits*total_frames_with_deltas)]
    output = []

    #Conv
    skip = total_frames_with_deltas
    for steps in range(total_frames_with_deltas):
        conv = Conv1D(filters = 100, kernel_size = 8)
        column = 0
        for  _ in range(splits):
            #print "column " + str(column) + "steps: " + str(steps)
            output.append(conv(list_of_input[(column*skip)+steps]))
            column = column + 1

    #print len(output)
    #print splits*total_frames_with_deltas


    conv = []
    for section in range(splits):
        column = 0
        skip = splits
        temp = []
        for _ in range(total_frames_with_deltas):
            temp.append(output[((column*skip)+section)])
            column = column + 1
        conv.append(Add()(temp))
        #print len(conv)



    output_conc = Concatenate()(conv)
    #print output_conc.get_shape
    output_conv = Reshape((splits, -1))(output_conc)
    #print output_conv.get_shape

    #Pool
    pooled = MaxPooling1D(pool_size = 6, strides = 2)(output_conv)
    reshape = Reshape((1,-1))(pooled)

    #Fc
    dense1 = Dense(units = 1024, activation = 'relu',    name = "dense_1")(reshape)
    #dense2 = Dense(units = 1024, activation = 'relu',    name = "dense_2")(dense1)
    dense3 = Dense(units = 1024, activation = 'relu',    name = "dense_3")(dense1)
    final = Dense(units = 5, activation = 'relu',    name = "final")(dense3)

    model = Model(inputs = list_of_input , outputs = final)
    sgd = SGD(lr=0.1, decay=1e-1, momentum=0.9, nesterov=True)
    model.compile(loss="categorical_crossentropy", optimizer=sgd , metrics = ['accuracy'])
    print "compiled"

    model_yaml = model.to_yaml()
    with open("model.yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)

    print "Model saved!"

    log= CSVLogger('/home/carl/kaldi-trunk/dnn/experimental/yesno_cnn_50_training_total_frames_'+str(total_frames)+"_dim_"+str(dim)+"_window_height_"+str(window_height)+".csv")
    filepath='yesno_cnn_50_training_total_frames_'+str(total_frames)+"_dim_"+str(dim)+"_window_height_"+str(window_height)+"weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_weights_only=True, mode='max')


    print "log"
    #plot_model(model, to_file='model.png')
    print "Fit"
    hist_current = model.fit_generator(train_generator(),
                        steps_per_epoch=444,#len(train_files),
                        epochs = 10000,
                        verbose = 1,
                        validation_data = test_generator(),
                        validation_steps=44,#len(test_files),
                        pickle_safe = True,
                        workers = 4,
                        callbacks = [log,checkpoint])

fws()

Run the script with: python name_of_script.py yesno 50 40 8 1

This gives me the full traceback:

carl@ca-ThinkPad-T420s:~/Dropbox$ python mini.py yesno 50 40 8 1
Using TensorFlow backend.
Couldn't import dot_parser, loading of dot files will not be possible.
hallo
hallo
hallo
compiled
Model saved!
log
Fit
/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:2252: UserWarning: Expected no kwargs, you passed 1
kwargs passed to function are ignored with Tensorflow backend
  warnings.warn('\n'.join(msg))
Epoch 1/10000
2017-05-26 13:01:45.851125: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-26 13:01:45.851345: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-26 13:01:45.851392: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
443/444 [============================>.] - ETA: 4s - loss: 100.1266 - acc: 0.3138Epoch 00000: saving model to yesno_cnn_50_training_total_frames_50_dim_40_window_height_8weights-improvement-00-0.48.hdf5
Traceback (most recent call last):
  File "mini.py", line 205, in <module>

  File "mini.py", line 203, in fws

  File "/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1933, in fit_generator
    callbacks.on_epoch_end(epoch, epoch_logs)
  File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 77, in on_epoch_end
    callback.on_epoch_end(epoch, logs)
  File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 411, in on_epoch_end
    self.model.save_weights(filepath, overwrite=True)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2503, in save_weights
    save_weights_to_hdf5_group(f, self.layers)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2746, in save_weights_to_hdf5_group
    f.attrs['layer_names'] = [layer.name.encode('utf8') for layer in layers]
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2684)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2642)
  File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/attrs.py", line 93, in __setitem__
    self.create(name, data=value, dtype=base.guess_dtype(value))
  File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/attrs.py", line 183, in create
    attr = h5a.create(self._id, self._e(tempname), htype, space)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2684)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2642)
  File "h5py/h5a.pyx", line 47, in h5py.h5a.create (/tmp/pip-4rPeHA-build/h5py/h5a.c:1904)
RuntimeError: Unable to create attribute (Object header message is too large)
Tags: keras, callback
5 Answers

3 votes

If you look at the amount of data that Keras is trying to save under the layer_names attribute (inside the output HDF5 file being created), you will find that it takes more than 64 kB.

np.asarray([layer.name.encode('utf8') for layer in model.layers]).nbytes
>> 77100

I am quoting from https://support.hdfgroup.org/HDF5/faq/limits.html:

Is there an object header limit and how does that affect HDF5?

There is a limit (in HDF5-1.8) of the object header, which is 64 KB. The datatype for a dataset is stored in the object header, so there is therefore a limit on the size of the datatype that you can have. (See HDFFV-1089)

The code above was copied (almost verbatim) from the traceback:

File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2746, in save_weights_to_hdf5_group
f.attrs['layer_names'] = [layer.name.encode('utf8') for layer in layers]

I am using the numpy asarray method just to get the figure quickly; h5py arrives at a similar number (I guess), see https://github.com/h5py/h5py/blob/master/h5py/_hl/attrs.py#L102 if you want to find the exact one.

Anyway, either you need to implement your own methods for saving/loading weights (or use an existing workaround), or you need to give a really short name to ALL the layers inside your model :), something like this:

list_of_input = [Input(shape=(8,3), name=('i%x' % i)) for i in range(splits*total_frames_with_deltas)]
conv = Conv1D(filters = 100, kernel_size = 8, name='cv%x' % steps) 
conv.append(Add(name='add%x' % section)(temp))
output_conc = Concatenate(name='ct')(conv)
output_conv = Reshape((splits, -1), name='rs1')(output_conc)
pooled = MaxPooling1D(pool_size = 6, strides = 2, name='pl')(output_conv)
reshape = Reshape((1,-1), name='rs2')(pooled) 
dense1 = Dense(units = 1024, activation = 'relu', name = "d1")(reshape) 
dense2 = Dense(units = 1024, activation = 'relu', name = "d2")(dense1)
dense3 = Dense(units = 1024, activation = 'relu', name = "d3")(dense1) 
final = Dense(units = 5, activation = 'relu', name = "fl")(dense3)

You must not forget to name ALL the layers, because the (numpy) string array into which the layer names are converted uses the size of the longest string for every string in it when it gets saved!

After renaming the layers as proposed above (which takes roughly 26 kB), your model is saved successfully. Hope this elaborate answer helps someone.

Update: I have just made a PR to Keras that should fix the issue without implementing any custom saving/loading methods, see 7508


1 vote

A simple solution, albeit perhaps not the most elegant one, could be to run a while loop with epochs = 1; a rough sketch follows after the list.

  1. Get the weights, as well as the accuracy and loss, at the end of each epoch.
  2. Save the weights to file 1 with model.get_weights.
  3. If the accuracy is greater than in the previous epoch (i.e. loop iteration), store the weights to a different file (file 2).
  4. Run the loop again, loading the weights from file 1.
  5. Break the loop by setting up manual early stopping, so that it breaks when the loss has not improved for a certain number of loops.
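
A minimal sketch of that loop, assuming the model and train_generator from the question; the file names and the patience value are made up for illustration:

import pickle

best_acc = -float('inf')
epochs_without_improvement = 0
patience = 10   # hypothetical manual early-stopping threshold

while epochs_without_improvement < patience:
    hist = model.fit_generator(train_generator(), steps_per_epoch=444, epochs=1, verbose=1)
    acc = hist.history['acc'][-1]

    # step 2: always dump the latest weights to "file 1"
    pickle.dump(model.get_weights(), open('weights_file_1.p', 'wb'))

    # step 3: if accuracy improved over the previous epoch, also keep a copy in "file 2"
    if acc > best_acc:
        best_acc = acc
        epochs_without_improvement = 0
        pickle.dump(model.get_weights(), open('weights_file_2.p', 'wb'))
    else:
        epochs_without_improvement += 1

    # step 4: reload "file 1" before the next loop iteration
    model.set_weights(pickle.load(open('weights_file_1.p', 'rb')))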

1 vote

You can use get_weights() together with numpy.save.

It's not the best solution, because it will save several files, but it actually works.

The problem is that you won't have the "optimizer" saved with its current state. But you can perhaps work around that by using smaller learning rates after loading.

Custom callback using numpy.save:

def myCallback(epoch,logs):
    global storedLoss
    #do your comparisons here using the "logs" var.
    print(logs)


    if (logs['loss'] < storedLoss):

        storedLoss = logs['loss']
        for i in range(len(model.layers)):

            WandB = model.layers[i].get_weights()

            if len (WandB) > 0: #necessary because some layers have no weights

                np.save("W" + "-" + str(i), WandB[0],False) 
                np.save("B" + "-" + str(i), WandB[1],False)


    #remember that get and set weights use a list: [weights,biases]   
    #it may happen (not sure) that there is no bias, and thus you may have to check it (len(WandB)==1).   

The logs var brings a dictionary with named metrics, such as "loss" and "accuracy", if you used it.

You can store the losses from your callback in a global variable, and compare whether each loss is better or worse than the last one.

When fitting, use a lambda callback:

from keras.callbacks import LambdaCallback
model.fit(...,callbacks=[LambdaCallback(on_epoch_end=myCallback)])   

In the example above I used the LambdaCallback, which has more possibilities than just on_epoch_end.

For loading, do an analogous loop:

#you have to create the model first and then set the layers
def loadModel(model):
    for i in range(len(model.layers)):
        WandBForCheck = model.layers[i].get_weights() 

        if len (WandBForCheck) > 0: #necessary because some layers have no weights
            W = np.load("W" + "-" + str(i) + ".npy")   # matches the np.save file names used in the callback above
            B = np.load("B" + "-" + str(i) + ".npy")
            model.layers[i].set_weights([W,B])
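
Since the optimizer state is not preserved this way, a possible resume step (a sketch, assuming the SGD setup from the question) is to load the weights and recompile with a smaller learning rate before continuing training:

from keras.optimizers import SGD

loadModel(model)   # restores the W-*.npy / B-*.npy files saved by the callback above
# recompile with a reduced learning rate, since momentum/decay state was lost
model.compile(loss="categorical_crossentropy",
              optimizer=SGD(lr=0.01, momentum=0.9, nesterov=True),
              metrics=['accuracy'])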

1 vote

See the follow-up at https://github.com/fchollet/keras/issues/6766 and https://github.com/farizrahman4u/keras-contrib/pull/90.

I looked at the YAML and the root cause is probably that you have so many Inputs. A few Inputs with many dimensions are preferred over many Inputs, particularly if you can use scanning and batch operations to do everything efficiently. (A sketch of that idea follows.)
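
Purely as an illustration of that point (this is an assumption, not code from the answer): a single Input carrying a "segment" axis plus one TimeDistributed Conv1D can replace the thousands of separate Input layers. Unlike the question's model, this shares one convolution across all segments; it reuses splits, total_frames_with_deltas and window_height from the question.

from keras.layers import Input, Conv1D, TimeDistributed
from keras.models import Model

n_segments = splits * total_frames_with_deltas     # 33 * 150 = 4950 with the question's arguments
single_input = Input(shape=(n_segments, window_height, 3))
# one shared Conv1D applied to every (window_height, 3) segment
per_segment_conv = TimeDistributed(Conv1D(filters=100, kernel_size=window_height))(single_input)
# per_segment_conv has shape (batch, n_segments, 1, 100); pooling / dense layers would follow here
sketch_model = Model(inputs=single_input, outputs=per_segment_conv)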

Now, ignoring that entirely, here is how you can save and load your model if it has too much stuff in it to save as JSON efficiently:

You can pass save_weights_only=True. That won't save optimizer weights, so it isn't a great solution.

I just put together a PR to save model weights and optimizer weights, but not the configuration. When you want to load, first instantiate and compile the model as you would when you're about to train it, then use load_all_weights to load the model and optimizer weights into that model. I'll try to merge it soon so you can use it from the master branch.

You could use something like this:

from keras.callbacks import LambdaCallback
from keras_contrib.utils.save_load_utils import save_all_weights, load_all_weights
# do some stuff to create and compile model
# use `save_all_weights` as a callback to checkpoint your model and optimizer weights
model.fit(..., callbacks=[LambdaCallback(on_epoch_end=lambda epoch, logs: save_all_weights(model, "checkpoint-{:05d}.h5".format(epoch)))])
# use `load_all_weights` to load model and optimizer weights into an existing model
# if not compiled (no `model.optimizer`), this will just load model weights
load_all_weights(model, 'checkpoint-1337.h5')

So I don't endorse the model, but if you want to get it to save and load anyway, then this should probably work for you.

As a side note, if you want to save weights in a different format, something like this would work:

pickle.dump([K.get_value(w) for w in model.weights], open( "save.p", "wb" ) )
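
The matching load step (a sketch, assuming the same save.p file and an already-built model) would be:

import pickle
from keras import backend as K

# restore the pickled values into the model's weight variables, in the same order they were dumped
for w, value in zip(model.weights, pickle.load(open("save.p", "rb"))):
    K.set_value(w, value)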

Cheers


0 votes

Your model architecture must be too large to be saved.

Use get_weights and set_weights to save and load the model, respectively. Do not use the ModelCheckpoint callback; once training is over, save the weights with pickle, as sketched below.

Have a look at this link: Unable to save DataFrame to HDF5 ("object header message is too large")
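
A rough sketch of that approach, assuming the model from the question; the file name final_weights.p is made up:

import pickle

# once training is over, pickle the full weight list ...
pickle.dump(model.get_weights(), open('final_weights.p', 'wb'))

# ... and later restore it into a freshly built, identical architecture
model.set_weights(pickle.load(open('final_weights.p', 'rb')))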
