How do I format ragged tensors for an encoder-decoder model?

I am building a seq2seq model with an encoder-decoder architecture. For it I built a tf.data.Dataset pipeline that reads texts from a directory, vectorizes them with tf.keras.layers.TextVectorization, and preprocesses them for model training. I cannot get my labels into the shape (None, seq_len, target_vocab_size). I tried mapping tf.keras.utils.to_categorical onto the labels, but it does not work on tensors. Strangely, I could not find any material discussing a similar problem. Here is my implementation:

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, TimeDistributed
from tensorflow.keras.models import Model

BUFFER_SIZE = len(articles)
BATCH_SIZE = 64

train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))

val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))

context_vectorizer = tf.keras.layers.TextVectorization(
    standardize = tf_lower_and_split_punct,
    max_tokens = MAX_VOCAB_SIZE,
    ragged=True)

target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))

def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)

    target_in = target[:,:-1].to_tensor()
    target_out = target[:,1:].to_tensor()
    # target_out = target[:,:-1]
    return (context, target_in), target_out

train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
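
# Sanity check (sketch): printing the element spec makes the shape mismatch visible;
# the labels come out as integer token ids of shape (batch, seq_len), not one-hot
# vectors of shape (batch, seq_len, target_vocab_size).
print(train_ds.element_spec)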

def encoder(hsize, embed_dim=200):
    en_input_layer = Input(shape=(None,), name='encoder_input_layer', ragged=True)
    en_embed = Embedding(context_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='encoder_embedding_layer')
    en_embed_out = en_embed(en_input_layer)
    en_gru_1 = GRU(hsize, return_sequences=True, return_state=True, name='encoder_gru_layer_1')
    en_gru_1_out, en_gru_states = en_gru_1(en_embed_out)
    return en_input_layer, en_gru_1_out, en_gru_states

def decoder(hsize, encoder_states, embed_dim=200):
    de_input_layer = Input(shape=(None,), name='decoder_input_layer', ragged=True)
    de_embed = Embedding(target_vectorizer.vocabulary_size()+1, output_dim=embed_dim, name='decode_embedding_layer')
    de_embed_out = de_embed(de_input_layer)
    de_gru_1 = GRU(hsize, return_sequences=True, name='decoder_gru_layer_1')
    de_gru_1_out = de_gru_1(de_embed_out, initial_state=encoder_states)
    de_dense = TimeDistributed(Dense(target_vectorizer.vocabulary_size(), activation='softmax'), name='time_distributed_output_layer')
    de_preds = de_dense(de_gru_1_out)
    return de_input_layer, de_preds

hsize = 256

def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                    metrics=["acc"])
    return model

The model summary is as follows:


 Layer (type)                Output Shape                 Param #   Connected to                  
==================================================================================================
 encoder_input_layer (Input  [(None, None)]               0         []                            
 Layer)                                                                                           
                                                                                                  
 decoder_input_layer (Input  [(None, None)]               0         []                            
 Layer)                                                                                           
                                                                                                  
 encoder_embedding_layer (E  (None, None, 200)            437200    ['encoder_input_layer[0][0]'] 
 mbedding)                                                                                        
                                                                                                  
 decode_embedding_layer (Em  (None, None, 200)            244200    ['decoder_input_layer[0][0]'] 
 bedding)                                                                                         
                                                                                                  
 encoder_gru_layer_1 (GRU)   [(None, None, 256),          351744    ['encoder_embedding_layer[0][0
                              (None, 256)]                          ]']                           
                                                                                                  
 decoder_gru_layer_1 (GRU)   (None, None, 256)            351744    ['decode_embedding_layer[0][0]
                                                                    ',                            
                                                                     'encoder_gru_layer_1[0][1]'] 
                                                                                                  
 time_distributed_output_la  (None, None, 1220)           313540    ['decoder_gru_layer_1[0][0]'] 
 yer (TimeDistributed)                                                                            
                                                                                                  
==================================================================================================
Total params: 1698428 (6.48 MB)
Trainable params: 1698428 (6.48 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________

The model compiles fine, but when I run the fit method I get the following error:

ValueError: Shapes (None, None) and (None, None, 1220) are incompatible

I am struggling to define the model's Input layers, or the preprocess_text output, so that they work correctly with the model definition.

python tensorflow keras deep-learning nlp
1 Answer

Reposting from the comments above: to resolve this we can either switch to a loss function that works with sparse (integer) labels, or convert the target labels to one-hot encoding. Below is complete working code with some dummy data.

import numpy as np
import tensorflow as tf

# encoder(), decoder() and the Keras layer imports are reused from the question above.
make_one_hot = False # params: True, False

num_articles = 1000
num_summaries = 1000
MAX_VOCAB_SIZE = 5000
articles = np.array([f"Article {i}" for i in range(num_articles)])
summaries = np.array([f"Summary {i}" for i in range(num_summaries)])
is_train = np.random.rand(len(articles)) < 0.8

def tf_lower_and_split_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '[.?!,¿]', ' ')
    text = tf.strings.strip(text)
    text = tf.strings.join([' ', text, ' '])
    return text
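
# Quick check (sketch): the standardizer lowercases, replaces basic punctuation with
# spaces, trims, and wraps the string in a leading and trailing space.
print(tf_lower_and_split_punct(tf.constant("Hello, World!")).numpy())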
BUFFER_SIZE = len(articles)
BATCH_SIZE = 64

train_raw = (tf.data.Dataset
             .from_tensor_slices((articles[is_train], summaries[is_train]))
             .shuffle(BUFFER_SIZE)
             .batch(BATCH_SIZE))

val_raw = (tf.data.Dataset
           .from_tensor_slices((articles[~is_train], summaries[~is_train]))
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE))

context_vectorizer = tf.keras.layers.TextVectorization(
    standardize = tf_lower_and_split_punct,
    max_tokens = MAX_VOCAB_SIZE,
    ragged=True)

target_vectorizer = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

context_vectorizer.adapt(train_raw.map(lambda context, target: context))
target_vectorizer.adapt(train_raw.map(lambda context, target: target))
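
# Sanity check (sketch): confirm the adapted vocabulary sizes that the embedding and
# output layers will be built from.
print(context_vectorizer.vocabulary_size(), target_vectorizer.vocabulary_size())
print(target_vectorizer.get_vocabulary()[:5])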
def preprocess_text(context, target):
    context = context_vectorizer(context).to_tensor()
    target = target_vectorizer(target)

    target_in = target[:,:-1].to_tensor()
    target_out = target[:,1:].to_tensor()
    
    if make_one_hot:
        target_out = tf.one_hot(
            target_out, 
            depth=tf.cast(
                target_vectorizer.vocabulary_size(), dtype='int32'
            )
        )
    return (context, target_in), target_out

train_ds = train_raw.map(preprocess_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(preprocess_text, tf.data.AUTOTUNE)
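
# Shape check (sketch): with make_one_hot=False the labels stay integer ids of shape
# (batch, seq_len); with make_one_hot=True they become one-hot vectors of shape
# (batch, seq_len, target_vocab_size).
(sample_context, sample_target_in), sample_target_out = next(iter(train_ds))
print(sample_context.shape, sample_target_in.shape, sample_target_out.shape)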
def create_model(hsize):
    en_input_layer, enc_out, enc_states = encoder(hsize)
    de_input_layer, de_preds = decoder(hsize, enc_states)
    model = Model(inputs=[en_input_layer, de_input_layer], outputs=de_preds)
    
    if make_one_hot:
        loss_fn = 'categorical_crossentropy'
    else:
        loss_fn = 'sparse_categorical_crossentropy'
    
    model.compile(
        optimizer='adam', 
        loss=loss_fn,
        metrics=["acc"]
    )
    return model


model = create_model(hsize)
model.fit(train_ds)
5s 24ms/step - loss: 6.7114 - acc: 0.003
<keras.callbacks.History at 0x7bfef0423f40>
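
As a side note, the two settings optimize the same objective. A minimal sketch with toy values, illustrating the equivalence this fix relies on: integer labels with sparse_categorical_crossentropy produce the same per-token loss as their one-hot encoding with categorical_crossentropy.

import tensorflow as tf

y_true_ids = tf.constant([[2, 0, 1]])                 # integer targets, shape (1, 3)
y_pred = tf.nn.softmax(tf.random.uniform((1, 3, 4)))  # predicted probabilities over 4 classes

sparse_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_ids, y_pred)
dense_loss = tf.keras.losses.categorical_crossentropy(tf.one_hot(y_true_ids, depth=4), y_pred)
print(tf.reduce_max(tf.abs(sparse_loss - dense_loss)).numpy())  # ~0.0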

Reference

Selecting loss and metrics for Tensorflow model
