我想使用 Keras 实现多视图变分自动编码器 (VAE) 模型。
代码:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.utils import plot_model
# Dummy data for transcriptomics
transcriptomics_df = pd.DataFrame(np.random.rand(10, 100), index=[f'gene_{i}' for i in range(1, 11)],
columns=[f'sample_{j}' for j in range(1, 101)])
# Dummy data for epigenomics
epigenomics_df = pd.DataFrame(np.random.rand(8, 100), index=[f'methyl_{i}' for i in range(1, 9)],
columns=[f'sample_{j}' for j in range(1, 101)])
# Dummy data for proteomics
proteomics_df = pd.DataFrame(np.random.rand(5, 100), index=[f'protein_{i}' for i in range(1, 6)],
columns=[f'sample_{j}' for j in range(1, 101)])
# Dummy labels for survival and immune response
survival_labels = pd.Series(np.random.choice([0, 1], size=100), name='Overall_Survival',
index=[f'sample_{j}' for j in range(1, 101)])
immune_labels = pd.Series(np.random.choice([0, 1], size=100), name='Immune_Response',
index=[f'sample_{j}' for j in range(1, 101)])
# Dummy data for mRNA deconvolution
mrna_deconv = pd.DataFrame(np.random.rand(5, 10), index=[f'mrna_cell_type_{i}' for i in range(1, 6)],
columns=[f'sample_{j}' for j in range(1, 11)])
# Dummy data for methylation deconvolution
meth_deconv = pd.DataFrame(np.random.rand(5, 8), index=[f'meth_cell_type_{i}' for i in range(1, 6)],
columns=[f'sample_{j}' for j in range(1, 9)])
common_index = set(transcriptomics_df.columns).intersection(epigenomics_df.columns, proteomics_df.columns,
survival_labels.index, set(immune_labels.index),
mrna_deconv.columns, meth_deconv.columns)
transcriptomics_df = transcriptomics_df.loc[:, transcriptomics_df.columns.isin(common_index)]
epigenomics_df = epigenomics_df.loc[:, epigenomics_df.columns.isin(common_index)]
proteomics_df = proteomics_df.loc[:, proteomics_df.columns.isin(common_index)]
survival_labels = survival_labels.loc[survival_labels.index.isin(common_index)]
immune_labels = immune_labels.loc[immune_labels.index.isin(common_index)]
mrna_deconv = mrna_deconv.loc[:, mrna_deconv.columns.isin(common_index)]
meth_deconv = meth_deconv.loc[:, meth_deconv.columns.isin(common_index)]
transcriptomics_deconv_df = pd.concat([transcriptomics_df, mrna_deconv], axis=0)
epigenomics_deconv_df = pd.concat([epigenomics_df, meth_deconv], axis=0)
# Create a dictionary to hold your dataframes
omics_data_dict = {
"transcriptomics": transcriptomics_deconv_df.T,
"epigenomics": epigenomics_deconv_df.T,
"proteomics": proteomics_df.T,
}
# Multivariate
y = pd.concat([survival_labels, immune_labels], axis=1)
# Concatenate all views for each dataset
X = {key: omics_data_dict[key] for key in omics_data_dict.keys()}
# Split the data into train, test, and validation sets
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15
train_data_dict, val_data_dict, test_data_dict = {}, {}, {}
for key, data in X.items():
X_train, X_temp, y_train, y_temp = train_test_split(data, y, test_size=1 - train_ratio, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_ratio/(test_ratio + val_ratio), random_state=42)
train_data_dict[key] = (X_train, y_train)
test_data_dict[key] = (X_test, y_test)
val_data_dict[key] = (X_val, y_val)
# Concatenate all labels for each dataset
train_labels = {key: np.concatenate([train_data_dict[key][1] for key in train_data_dict.keys()]) for key in train_data_dict.keys()}
val_labels = {key: np.concatenate([val_data_dict[key][1] for key in val_data_dict.keys()]) for key in val_data_dict.keys()}
test_labels = {key: np.concatenate([test_data_dict[key][1] for key in test_data_dict.keys()]) for key in test_data_dict.keys()}
def create_vae(input_dim, latent_dim, learning_rate, num_classes, dropout_rate):
# Encoder
encoder_inputs = layers.Input(shape=(input_dim,), name='encoder_input')
y_input = layers.Input(shape=(num_classes,), name='class_input')
z = layers.Dense(latent_dim, activation='relu')(encoder_inputs)
z = layers.Dropout(dropout_rate)(z)
# Concatenate the encoded output with class labels
z_with_class = layers.Concatenate(name='concat_layer')([z, y_input])
# Decoder
decoder_inputs = layers.Input(shape=(latent_dim + num_classes,), name='decoder_input')
x_decoded = layers.Dense(input_dim, activation='sigmoid')(decoder_inputs)
# Classification layer
classification_layer = layers.Dense(num_classes, activation='softmax', name='classification')(z_with_class)
# Full model
full_model = models.Model([encoder_inputs, y_input, decoder_inputs], [x_decoded, classification_layer], name='full_model')
def custom_loss(y_true, y_pred):
reconstruction_loss = tf.keras.losses.binary_crossentropy(y_true[0], y_pred[0])
classification_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true[1], y_pred[1])
return reconstruction_loss + classification_loss
full_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss=custom_loss, metrics=['accuracy'])
return full_model
# Arbitrary parameters
best_latent_dim = 2
best_learning_rate = 0.5
best_dropout_rate = 0.5
best_epochs = 2
best_batch_size = 2
best_num_classes = len(np.unique(train_labels))
# Create VAE models for each omics dataset
vae_models = {}
for key, (train_data, _) in train_data_dict.items():
input_dim = train_data.T.shape[1]
vae_models[key] = create_vae(input_dim=input_dim, latent_dim=best_latent_dim,
learning_rate=best_learning_rate,
num_classes=best_num_classes,
dropout_rate=best_dropout_rate)
# Train VAE models
for key in train_data_dict.keys():
# Separate the inputs
encoder_inputs = train_data_dict[key][0]
y_input = train_data_dict[key][1] # Use the key to index the correct labels
decoder_inputs = train_data_dict[key][0] # Use the same data for decoder inputs as encoder inputs
# Ensure data shapes are consistent
print(f"{key}: x shape={encoder_inputs.shape}, y shape={y_input.shape}")
# Verify key matching between train_data_dict and vae_models
assert key in vae_models, f"Key {key} not found in vae_models"
# Fit the VAE model
vae_models[key].fit(
x=[encoder_inputs, y_input, decoder_inputs],
y=[encoder_inputs, y_input], # Adjust labels accordingly
epochs=best_epochs,
batch_size=best_batch_size,
validation_data=(
[val_data_dict[key][0].T, val_data_dict[key][0].T], # Transpose validation data
[val_data_dict[key][0].T, val_data_dict[key][0].T] # Transpose validation data
),
verbose=1
)
追溯:
transcriptomics: x shape=(5, 15), y shape=(5, 2)
Epoch 1/2
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [227], in <cell line: 2>()
16 assert key in vae_models, f"Key {key} not found in vae_models"
18 # Fit the VAE model
---> 19 vae_models[key].fit(
20 x=[encoder_inputs, y_input, decoder_inputs],
21 y=[encoder_inputs, y_input], # Adjust labels accordingly
22 epochs=best_epochs,
23 batch_size=best_batch_size,
24 validation_data=(
25 [val_data_dict[key][0].T, val_data_dict[key][0].T], # Transpose validation data
26 [val_data_dict[key][0].T, val_data_dict[key][0].T] # Transpose validation data
27 ),
28 verbose=1
29 )
File /scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/keras/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File /local/scratch/melchua/slrmtmp.42333570/__autograph_generated_filegjukc1vd.py:15, in outer_factory.<locals>.inner_factory.<locals>.tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
ValueError: in user code:
File "/scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/keras/engine/training.py", line 1284, in train_function *
return step_function(self, iterator)
File "/scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/keras/engine/training.py", line 1268, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in run_step **
outputs = model.train_step(data)
File "/scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/keras/engine/training.py", line 1050, in train_step
y_pred = self(x, training=True)
File "/scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/scg/apps/software/jupyter/python_3.9/lib/python3.9/site-packages/keras/engine/input_spec.py", line 298, in assert_input_compatibility
raise ValueError(
ValueError: Input 2 of layer "full_model" is incompatible with the layer: expected shape=(None, 3), found shape=(None, 15)
输入尺寸:
X
是数据集的字典,而y
是多元数据框。 X
索引对应于y
索引
len(X)
3
X['transcriptomics'].shape
(8, 15)
X['epigenomics'].shape
(8, 13)
X['proteomics'].shape
(8, 5)
y.shape
(8, 2)
这会崩溃,因为您将编码器输入提供给解码器,这是没有意义的。如果您不打算单独使用解码器,则根本不需要任何
decoder_inputs
。您可以简单地在 create_vae
函数中链接编码器和解码器:
x_decoded = layers.Dense(input_dim, activation='sigmoid')(z_with_class)
然后完全删除
decoder_inputs
。