我正在使用 LSTM VAE(变分自动编码器)解决时间序列问题,我已经构建了如下的 VAE 模型
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
class VAE:
    """LSTM variational autoencoder assembled from three Keras models.

    ``hidden_layer_units`` lists the layer widths of the whole network: the
    first ``(len - 1) // 2`` entries are encoder LSTM layers, the next entry
    is the latent-space dimension, and every entry after it is a decoder LSTM
    layer. ``hidden_layer_leakyrelu_alphas`` and
    ``hidden_layer_dropout_rates`` are indexed with the same positions; the
    entry at the latent position is never read.

    After construction ``encoder``, ``decoder`` and ``model`` hold the
    encoder, the decoder and the end-to-end autoencoder.

    The Keras names used here (``Model``, ``Input``, ``LSTM``, ``Dense``,
    ``K``, ``Adam``, ...) are expected to be imported at module level.
    """

    def __init__(self,
                 hidden_layer_units,
                 hidden_layer_leakyrelu_alphas,
                 hidden_layer_dropout_rates,
                 batch_size,
                 time_steps,
                 num_features,
                 is_stateful_learning):
        """Store the configuration and build all three models.

        Args:
            hidden_layer_units: Layer widths (encoder LSTMs, latent
                dimension, decoder LSTMs). Needs at least three entries.
            hidden_layer_leakyrelu_alphas: LeakyReLU alpha per layer position.
            hidden_layer_dropout_rates: Dropout rate per layer position.
            batch_size: Fixed batch size; only used to declare the inputs
                when ``is_stateful_learning`` is true.
            time_steps: Length of every input window.
            num_features: Number of features per time step of the input.
            is_stateful_learning: Build stateful LSTMs with a fixed batch
                shape when true.

        Raises:
            ValueError: If ``hidden_layer_units`` has fewer than three
                entries.
        """
        vae_total_layers = len(hidden_layer_units)
        # With fewer than three entries the encoder gets no LSTM layer, so
        # the bottleneck would receive a 3-D tensor and the decoder could not
        # be attached. Fail early with a readable message instead.
        if vae_total_layers < 3:
            raise ValueError(
                "hidden_layer_units needs at least 3 entries (encoder LSTM, "
                f"latent dimension, decoder LSTM); got {vae_total_layers}"
            )

        self.hidden_layer_units = hidden_layer_units
        self.hidden_layer_leakyrelu_alphas = hidden_layer_leakyrelu_alphas
        self.hidden_layer_dropout_rates = hidden_layer_dropout_rates
        self.encoder_num_layers = (vae_total_layers - 1) // 2
        self.latent_space_dim = self.hidden_layer_units[self.encoder_num_layers]

        self.batch_size = batch_size
        self.time_steps = time_steps
        self.num_features = num_features
        self.is_stateful_learning = is_stateful_learning

        self.encoder = None
        self.decoder = None
        self.model = None
        self.model_input = None
        self.model_output = None
        self.mu = None
        self.log_variance = None
        # Weight applied to the KL term in the combined loss.
        self.kulback_coef = 0.0001

        self._build()

    def summary(self):
        """Print the Keras summaries of encoder, decoder and autoencoder."""
        self.encoder.summary()
        self.decoder.summary()
        self.model.summary()

    def compile(self, learning_rate=0.001):
        """Compile the autoencoder with Adam and the combined VAE loss.

        The reconstruction and KL terms are additionally reported as metrics.
        """
        optimizer = Adam(learning_rate=learning_rate)
        self.model.compile(optimizer=optimizer,
                           loss=self._calculate_combined_loss,
                           metrics=[self._calculate_reconstruction_loss, self._calculate_kl_loss])

    def _build(self):
        """Build encoder, decoder and autoencoder, in that order."""
        self._build_encoder()
        self._build_decoder()
        self._build_autoencoder()

    def _build_encoder(self):
        """Create the encoder model: input -> LSTM stack -> sampled latent point."""
        encoder_input = self._add_encoder_input()
        lstm_layers = self._add_encoder_lstm_layers(encoder_input)
        bottleneck = self._add_bottleneck(lstm_layers)
        self.model_input = encoder_input
        self.encoder = Model(encoder_input, bottleneck, name="encoder")

    def _build_decoder(self):
        """Create the decoder model: latent point -> repeat -> LSTM stack -> output."""
        decoder_input = self._add_decoder_input()
        repeater_layer = self._add_repeater_layer(decoder_input)
        lstm_layer = self._add_decoder_lstm_layer(repeater_layer)
        decoder_output = self._add_decoder_output(lstm_layer)
        self.decoder = Model(decoder_input, decoder_output, name="decoder")

    def _build_autoencoder(self):
        """Chain encoder and decoder into the end-to-end model."""
        # NOTE(review): the decoder was first built on its own
        # `decoder_input` placeholder and is re-applied here. In graph mode
        # with stateful LSTMs the reported "You must feed a value for
        # placeholder tensor 'decoder_input'" error presumably stems from
        # state updates still tied to that standalone graph - TODO confirm.
        model_input = self.model_input
        encoder_output = self.encoder(model_input)
        model_output = self.decoder(encoder_output)
        self.model_output = model_output
        self.model = Model(model_input, model_output, name="autoencoder")

    def _add_encoder_input(self):
        """Return the encoder input; the batch size is fixed only when stateful."""
        if self.is_stateful_learning:
            x = Input(batch_shape=(self.batch_size, self.time_steps, self.num_features), name="encoder_input")
        else:
            x = Input(shape=(self.time_steps, self.num_features), name="encoder_input")
        return x

    def _add_encoder_lstm_layers(self, encoder_input):
        """Create all LSTM layers of the encoder, each followed by LeakyReLU and Dropout."""
        x = encoder_input
        for layer_index, units in enumerate(self.hidden_layer_units[:self.encoder_num_layers]):
            lstm_params = {}
            # Every encoder LSTM except the last one hands the full sequence
            # to the next layer.
            if layer_index < self.encoder_num_layers - 1:
                lstm_params["return_sequences"] = True
            if self.is_stateful_learning:
                lstm_params["stateful"] = True
            x = LSTM(units=units, **lstm_params)(x)
            x = LeakyReLU(alpha=self.hidden_layer_leakyrelu_alphas[layer_index])(x)
            x = Dropout(rate=self.hidden_layer_dropout_rates[layer_index])(x)
        return x

    def _add_bottleneck(self, x):
        """Add the bottleneck: mu / log-variance Dense layers plus Gaussian sampling."""
        self.mu = Dense(self.latent_space_dim, name="mu")(x)
        self.log_variance = Dense(self.latent_space_dim, name="log_variance")(x)
        x = Lambda(self.sample_point_from_normal_distribution, name="encoder_output")([self.mu, self.log_variance])
        return x

    def sample_point_from_normal_distribution(self, args):
        """Return ``mu + exp(log_variance / 2) * epsilon`` with standard-normal epsilon.

        Args:
            args: Pair ``(mu, log_variance)``.
        """
        mu, log_variance = args
        epsilon = K.random_normal(shape=K.shape(mu), mean=0., stddev=1.)
        sampled_point = mu + K.exp(log_variance / 2) * epsilon
        return sampled_point

    def _add_decoder_input(self):
        """Return the decoder input; the batch size is fixed only when stateful."""
        if self.is_stateful_learning:
            x = Input(batch_shape=(self.batch_size, self.latent_space_dim), name="decoder_input")
        else:
            # One-element tuple: `(dim)` without the comma is just an int.
            x = Input(shape=(self.latent_space_dim,), name="decoder_input")
        return x

    def _add_repeater_layer(self, decoder_input):
        """Repeat the latent vector ``time_steps`` times."""
        return RepeatVector(self.time_steps)(decoder_input)

    def _add_decoder_lstm_layer(self, repeater_layer):
        """Create all LSTM layers of the decoder, each followed by LeakyReLU and Dropout."""
        x = repeater_layer
        for layer_index, units in enumerate(self.hidden_layer_units[self.encoder_num_layers + 1:]):
            lstm_params = {}
            if self.is_stateful_learning:
                # stateful build
                lstm_params = {'stateful': True, 'return_sequences': True}
            else:
                lstm_params["return_sequences"] = True
            # Position of this layer in the full per-layer configuration lists.
            layer_no = layer_index + self.encoder_num_layers + 1
            x = LSTM(units=units, **lstm_params)(x)
            x = LeakyReLU(alpha=self.hidden_layer_leakyrelu_alphas[layer_no])(x)
            x = Dropout(rate=self.hidden_layer_dropout_rates[layer_no])(x)
        return x

    def _add_decoder_output(self, lstm_layer):
        """Apply a single-unit Dense layer to every time step."""
        return TimeDistributed(Dense(1))(lstm_layer)

    def _calculate_combined_loss(self, y_target, y_predicted):
        """Return the reconstruction loss plus ``kulback_coef`` times the KL loss."""
        reconstruction_loss = self._calculate_reconstruction_loss(y_target, y_predicted)
        kl_loss = self._calculate_kl_loss(y_target, y_predicted)
        combined_loss = reconstruction_loss + (self.kulback_coef * kl_loss)
        return combined_loss

    def _calculate_reconstruction_loss(self, y_target, y_predicted):
        """Return the squared error averaged over axis 1."""
        error = y_target - y_predicted
        reconstruction_loss = K.mean(K.square(error), axis=1)
        return reconstruction_loss

    def _calculate_kl_loss(self, y_target, y_predicted):
        """Return the KL term computed from the stored ``mu`` / ``log_variance`` tensors.

        Both arguments are ignored; they exist only so the method matches the
        Keras loss/metric call signature.
        """
        kl_loss = -0.5 * K.sum(1 + self.log_variance - K.square(self.mu) - K.exp(self.log_variance), axis=1)
        return kl_loss
# Build Variational AutoEncoder(VAE) LSTM Model:
def build_lstm_neural_network(lstm_layer_units=None, leakyrelu_layer_alphas=None, dropout_layer_rates=None,
                              number_of_sequences=32, time_steps=32, data_dim=1, is_stateful_learning=False):
    """Build, compile and summarize an LSTM VAE and return its Keras model.

    Args:
        lstm_layer_units: Layer widths forwarded as ``hidden_layer_units``.
        leakyrelu_layer_alphas: LeakyReLU alpha per layer position.
        dropout_layer_rates: Dropout rate per layer position.
        number_of_sequences: Forwarded to ``VAE`` as ``batch_size``.
        time_steps: Length of every input window.
        data_dim: Forwarded to ``VAE`` as ``num_features``.
        is_stateful_learning: Whether to build stateful LSTM layers.

    Returns:
        The ``model`` attribute of the constructed ``VAE``.
    """
    # None sentinels replace the former mutable `[]` defaults; an omitted
    # argument is still passed on as an empty list.
    vae = VAE(
        hidden_layer_units=[] if lstm_layer_units is None else lstm_layer_units,
        hidden_layer_leakyrelu_alphas=[] if leakyrelu_layer_alphas is None else leakyrelu_layer_alphas,
        hidden_layer_dropout_rates=[] if dropout_layer_rates is None else dropout_layer_rates,
        batch_size=number_of_sequences,
        time_steps=time_steps,
        num_features=data_dim,
        is_stateful_learning=is_stateful_learning
    )
    # `learning_rate` is not a parameter of this function: it resolves to the
    # module-level name, which must be defined before this function is called.
    vae.compile(learning_rate)
    vae.summary()
    return vae.model
模型训练块如下所示
# Hyper-parameters of the network and of the training run.
nn_lstm_layer_units = [160, 3, 160]
nn_leakyrelu_layer_alphas = [0.0, 0.0, 0.0]
nn_dropout_layer_rates = [0.3, 0.0, 0.3]
batch_size = 96
win_length = 64
num_features = 6  # Univariate time series work as well: set num_features = 1
epochs = 782
learning_rate = 0.0001
want_stateful_learning = True

# Construct the LSTM VAE from the settings above.
model = build_lstm_neural_network(
    lstm_layer_units=nn_lstm_layer_units,
    leakyrelu_layer_alphas=nn_leakyrelu_layer_alphas,
    dropout_layer_rates=nn_dropout_layer_rates,
    number_of_sequences=batch_size,
    time_steps=win_length,
    data_dim=num_features,
    is_stateful_learning=want_stateful_learning,
)

# Window length used when slicing the series into sequences.
TIME_STEPS = win_length
# Build the training sequences consumed by the model.
def create_sequences(values, time_steps=TIME_STEPS):
    """Stack every contiguous window of ``time_steps`` items of ``values``.

    Consecutive windows start one position apart, so ``values`` of length
    ``n`` yields ``n - time_steps + 1`` windows stacked along a new first axis.
    """
    window_count = len(values) - time_steps + 1
    windows = [values[start:start + time_steps] for start in range(window_count)]
    return np.stack(windows)
# Convert the raw series into windowed sequences.
x_train = create_sequences(x_train)
x_val = create_sequences(x_val)

callbacks = []
# Number of leading sequences to drop; stays 0 for stateless training.
unfit_train_record_count = unfit_val_record_count = 0
if want_stateful_learning:
    # Stateful training: the number of samples has to be a multiple of the
    # batch size, so the remainder is cut off the front of each set.
    unfit_train_record_count = len(x_train) % batch_size
    unfit_val_record_count = len(x_val) % batch_size
    # Clear the states of the stateful model at the end of every epoch.
    stateful_model_reset_states = LambdaCallback(
        on_epoch_end=lambda epoch, logs: model.reset_states()
    )
    callbacks.append(stateful_model_reset_states)

early_stopping = EarlyStopping(monitor=monitor, patience=patience)
callbacks.append(early_stopping)

# Model training: the target is index 0 along the last axis of each window.
history = model.fit(
    x=x_train[unfit_train_record_count:],
    y=x_train[unfit_train_record_count:, :, [0]],
    validation_data=(
        x_val[unfit_val_record_count:],
        x_val[unfit_val_record_count:, :, [0]],
    ),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=False,
    callbacks=callbacks,
)
模型的无状态模式按预期工作,但有状态模式抛出如下错误-
1632/1632 [================================] - ETA: 0s - loss: 0.2447 - _calculate_reconstruction_loss: 0.2447 - _calculate_kl_loss: 0.0326 tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found. (0) Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [96,3] [[{{node decoder_input}}]] [[metrics/_calculate_reconstruction_loss/Identity/_229]] (1) Invalid argument: You must feed a value for placeholder tensor 'decoder_input' with dtype float and shape [96,3] [[{{node decoder_input}}]]
运行环境:Python 3.8.12,tensorflow-gpu 2.5,cuDNN 8.2.1.32
我不清楚为什么有状态模型为训练数据运行 1 Epoch,但是一旦它开始处理验证数据,它就会抛出错误。
我解决了这个问题,通过改变损失计算逻辑,我没有在VAE类中定义计算重建和KL损失的函数,而是将损失计算部分移到VAE类之外,如下
# Build Variational AutoEncoder(VAE) LSTM Model:
def build_lstm_neural_network(lstm_layer_units=None, leakyrelu_layer_alphas=None, dropout_layer_rates=None,
                              number_of_sequences=32, time_steps=32, data_dim=1, is_stateful_learning=False):
    """Build an LSTM VAE whose losses are attached with ``add_loss``.

    Instead of passing loss callables to ``compile``, the reconstruction and
    KL terms are computed from the model's own tensors and registered on the
    model, which is then compiled with ``loss=None``.

    Args:
        lstm_layer_units: Layer widths forwarded as ``hidden_layer_units``.
        leakyrelu_layer_alphas: LeakyReLU alpha per layer position.
        dropout_layer_rates: Dropout rate per layer position.
        number_of_sequences: Forwarded to ``VAE`` as ``batch_size``.
        time_steps: Length of every input window.
        data_dim: Forwarded to ``VAE`` as ``num_features``.
        is_stateful_learning: Whether to build stateful LSTM layers.

    Returns:
        The ``model`` attribute of the constructed ``VAE``.
    """
    # None sentinels replace the former mutable `[]` defaults; an omitted
    # argument is still passed on as an empty list.
    vae = VAE(
        hidden_layer_units=[] if lstm_layer_units is None else lstm_layer_units,
        hidden_layer_leakyrelu_alphas=[] if leakyrelu_layer_alphas is None else leakyrelu_layer_alphas,
        hidden_layer_dropout_rates=[] if dropout_layer_rates is None else dropout_layer_rates,
        batch_size=number_of_sequences,
        time_steps=time_steps,
        num_features=data_dim,
        is_stateful_learning=is_stateful_learning
    )

    # Add reconstruction loss
    # NOTE(review): the input carries `data_dim` features; if the model output
    # has a single feature this subtraction broadcasts across all input
    # features - confirm that this is the intended reconstruction target.
    error = vae.model_input - vae.model_output
    reconstruction_loss = K.mean(K.square(error))
    vae.model.add_loss(reconstruction_loss)
    vae.model.add_metric(reconstruction_loss, name='mse_loss', aggregation='mean')

    # Add KL loss
    # NOTE(review): `kl_beta` is neither a parameter nor a local; it has to
    # exist as a module-level name - confirm where it is defined.
    kl_loss = kl_beta * K.mean(-0.5 * K.sum(1 + vae.log_variance - K.square(vae.mu) - K.exp(vae.log_variance), axis = 1), axis=0)
    # The loss and metric are registered on `vae.model`: the bare name `model`
    # is a module-level variable that is not bound yet while this function is
    # building the model it will be assigned from.
    vae.model.add_loss(kl_loss)
    vae.model.add_metric(kl_loss, name='kl_loss', aggregation='mean')

    # NOTE(review): this expects the VAE instance to expose `learning_rate`
    # and `clipvalue` attributes - verify against the class definition in use.
    optimizer = Adam(learning_rate=vae.learning_rate, clipvalue=vae.clipvalue)
    vae.model.compile(loss=None, optimizer=optimizer)
    vae.summary()
    return vae.model
我在数据集与损失函数不匹配时也遇到过同样的情况。我尝试重新模拟了一下:可能出现损失值不变化、损失变成 nan,或者在验证阶段报错。原因可能是没有有效的值、数据不匹配,或者神经元(权重)没有得到更新;改用 TensorFlow 2.x 会更容易一些。
一种可能是验证数据不匹配:训练可以正常进行,但在验证时会出现错误。
Epoch 1/100
2022-01-23 21:04:59.846791: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8100
1/1 [==============================] - ETA: 0s - loss: 3.1866 - accuracy: 0.0000e+00Traceback (most recent call last):
另一种可能是损失函数不匹配:这种情况下神经元(权重)可能根本没有得到更新。
Epoch 1/100
2022-01-23 21:08:23.330068: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8100
1/1 [==============================] - 3s 3s/step - loss: 13.7138 - accuracy: 0.2000 - val_loss: 8.2133 - val_accuracy: 0.0000e+00
Epoch 2/100
1/1 [==============================] - 0s 65ms/step - loss: 7.7745 - accuracy: 0.0000e+00 - val_loss: 8.0456 - val_accuracy: 0.0000e+00
首先非常感谢您分享代码。我是 TensorFlow 的新手,仍然遇到同样的问题:即使按您提到的方式修改了损失计算逻辑,LSTM-VAE 仍然是训练完一个 epoch 后在验证阶段失败。另外,我正在参考上面的代码构建 LSTM-cVAE 模型,同样的问题也存在:LSTM-cVAE 甚至无法开始训练,并抛出关于 “decoder_input” 占位符的相同错误。我使用的是 TensorFlow 2.12.0 和 Python 3.9.15。在这篇帖子之后,您是否对代码做过其他修改?