I am building a seq2seq model with an encoder-decoder architecture. For it, I built a tf.data.Dataset pipeline that reads texts from a directory, vectorizes them with tf.keras.layers.TextVectorization, and preprocesses them for model training. I cannot get my labels into the shape (None, seq_len, target_vocab_size). I tried mapping tf.keras.utils.to_categorical over the labels, but it does not work on tensors. Strangely, I could not find any material discussing a similar problem. Here is my implementation:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (Input, Embedding, GRU, Dense,
                                     TimeDistributed)

BUFFER_SIZE = len(articles)
BATCH_SIZE = 64

train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))
val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))

context_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))

def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)
    target_in = target[:, :-1].to_tensor()   # decoder input: all tokens but the last
    target_out = target[:, 1:].to_tensor()   # labels: all tokens shifted left by one
    return (context, target_in), target_out

train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
def encoder(hsize, embed_dim=200):
    en_input_layer = Input(shape=(None,), name='encoder_input_layer', ragged=True)
    en_embed = Embedding(context_vectorizer.vocabulary_size() + 1,
                         output_dim=embed_dim, name='encoder_embedding_layer')
    en_embed_out = en_embed(en_input_layer)
    en_gru_1 = GRU(hsize, return_sequences=True, return_state=True,
                   name='encoder_gru_layer_1')
    en_gru_1_out, en_gru_states = en_gru_1(en_embed_out)
    return en_input_layer, en_gru_1_out, en_gru_states

def decoder(hsize, encoder_states, embed_dim=200):
    de_input_layer = Input(shape=(None,), name='decoder_input_layer', ragged=True)
    de_embed = Embedding(target_vectorizer.vocabulary_size() + 1,
                         output_dim=embed_dim, name='decode_embedding_layer')
    de_embed_out = de_embed(de_input_layer)
    de_gru_1 = GRU(hsize, return_sequences=True, name='decoder_gru_layer_1')
    de_gru_1_out = de_gru_1(de_embed_out, initial_state=encoder_states)
    de_dense = TimeDistributed(Dense(target_vectorizer.vocabulary_size(),
                                     activation='softmax'),
                               name='time_distributed_output_layer')
    de_preds = de_dense(de_gru_1_out)
    return de_input_layer, de_preds

hsize = 256

def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=["acc"])
    return model
The model summary is as follows:
Layer (type)                         Output Shape                      Param #   Connected to
=================================================================================================
encoder_input_layer (InputLayer)     [(None, None)]                    0         []
decoder_input_layer (InputLayer)     [(None, None)]                    0         []
encoder_embedding_layer (Embedding)  (None, None, 200)                 437200    ['encoder_input_layer[0][0]']
decode_embedding_layer (Embedding)   (None, None, 200)                 244200    ['decoder_input_layer[0][0]']
encoder_gru_layer_1 (GRU)            [(None, None, 256), (None, 256)]  351744    ['encoder_embedding_layer[0][0]']
decoder_gru_layer_1 (GRU)            (None, None, 256)                 351744    ['decode_embedding_layer[0][0]', 'encoder_gru_layer_1[0][1]']
time_distributed_output_layer        (None, None, 1220)                313540    ['decoder_gru_layer_1[0][0]']
  (TimeDistributed)
=================================================================================================
Total params: 1698428 (6.48 MB)
Trainable params: 1698428 (6.48 MB)
Non-trainable params: 0 (0.00 Byte)
The model compiles fine, but when I call the fit method I get the following error:

ValueError: Shapes (None, None) and (None, None, 1220) are incompatible

I am struggling to define the model's Input layers correctly, or alternatively the preprocess_text output that has to match the model definition.
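For anyone triaging the same error: it can be reproduced away from the model. categorical_crossentropy expects one-hot labels with the same rank as the predictions, while sparse_categorical_crossentropy consumes integer token ids directly. A minimal sketch (the vocabulary size 1220 is simply taken from the summary above):

import tensorflow as tf

vocab = 1220
preds = tf.random.uniform((2, 5, vocab))                          # (batch, seq_len, vocab)
labels = tf.random.uniform((2, 5), maxval=vocab, dtype=tf.int32)  # integer token ids

# OK: the sparse loss accepts integer ids of shape (batch, seq_len)
tf.keras.losses.sparse_categorical_crossentropy(labels, preds)

# OK: the dense loss needs one-hot labels of shape (batch, seq_len, vocab)
tf.keras.losses.categorical_crossentropy(tf.one_hot(labels, vocab), preds)

# Raises the "Shapes ... are incompatible" error: integer ids fed to the dense loss
# tf.keras.losses.categorical_crossentropy(labels, preds)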
Reposting from the comments above: to resolve the issue, we can either switch to a loss function that works on sparse (integer) labels, or convert the target labels to one-hot encoding. Below is the complete working code with some dummy data.
import numpy as np
import tensorflow as tf

make_one_hot = False  # params: True, False

num_articles = 1000
num_summaries = 1000
MAX_VOCAB_SIZE = 5000

articles = np.array([f"Article {i}" for i in range(num_articles)])
summaries = np.array([f"Summary {i}" for i in range(num_summaries)])
is_train = np.random.rand(len(articles)) < 0.8

def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '[.?!,¿]', ' ')
    text = tf.strings.strip(text)
    text = tf.strings.join([' ', text, ' '])
    return text

BUFFER_SIZE = len(articles)
BATCH_SIZE = 64

train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))
val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))

context_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)
target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))

def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)
    target_in = target[:, :-1].to_tensor()
    target_out = target[:, 1:].to_tensor()
    if make_one_hot:
        # Expand the integer ids to one-hot vectors so the dense
        # categorical_crossentropy loss can consume them.
        target_out = tf.one_hot(
            target_out,
            depth=tf.cast(
                target_vectorizer.vocabulary_size(), dtype='int32'
            )
        )
    return (context, target_in), target_out

train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
# encoder() and decoder() are unchanged from the question above.
def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    if make_one_hot:
        loss_fn = 'categorical_crossentropy'         # needs one-hot labels
    else:
        loss_fn = 'sparse_categorical_crossentropy'  # works on integer labels
    model.compile(
        optimizer='adam',
        loss=loss_fn,
        metrics=["acc"]
    )
    return model
model = create_model(hsize=256)
model.fit(train_ds)
5s 24ms/step - loss: 6.7114 - acc: 0.003
<keras.callbacks.History at 0x7bfef0423f40>
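As a quick sanity check (a sketch assuming the pipeline above has been built), pulling one batch shows exactly the label shape each loss will see:

(ctx, tgt_in), tgt_out = next(iter(train_ds))
print(ctx.shape, tgt_in.shape)   # (batch, src_len), (batch, tgt_len)
print(tgt_out.shape)
# make_one_hot=False -> (batch, tgt_len): integer ids for sparse_categorical_crossentropy
# make_one_hot=True  -> (batch, tgt_len, vocab_size): one-hot for categorical_crossentropy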