I created an NER model with BERT to detect medical entities, and it works very well. I am now trying to add a CRF layer on top of the BERT model to improve its performance, but I am running into an error that I can't seem to resolve.
Here is the error:
ValueError Traceback (most recent call last)
<ipython-input-32-99c3c401704b> in <cell line: 85>()
83
84 # Start training
---> 85 trainer.train()
7 frames
/usr/local/lib/python3.10/dist-packages/torchcrf/__init__.py in _validate(self, emissions, tags, mask)
165 no_empty_seq_bf = self.batch_first and mask[:, 0].all()
166 if not no_empty_seq and not no_empty_seq_bf:
--> 167 raise ValueError('mask of the first timestep must all be on')
168
169 def _compute_score(
ValueError: mask of the first timestep must all be on
And here is my code:
import torch
import torch.nn as nn
from torchcrf import CRF
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import BertTokenizerFast
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100
            # so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current
            # label or -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
label_all_tokens = False
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
tokenized_data = my_dataset_dict.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
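For reference, the aligned labels already contain -100 at the special-token positions and, since label_all_tokens is False, at non-first subword positions; a quick check on a toy batch (hypothetical sentence, any real one works) makes that visible:

# Toy sanity check: where does -100 end up in the aligned labels?
sample = tokenize_and_align_labels({"tokens": [["Aspirin", "helps"]],
                                    "ner_tags": [[1, 0]]})
print(sample["labels"])  # e.g. [[-100, 1, 0, -100]]: [CLS]/[SEP] get -100,
                         # and so would any non-first subword pieces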
class BERT_CRF_Model(nn.Module):
    def __init__(self, bert_model, num_labels):
        super(BERT_CRF_Model, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs[0])  # Last hidden state
        emissions = self.classifier(sequence_output)
        if labels is not None:
            # CRF loss
            loss = -self.crf(emissions, labels, mask=attention_mask.bool(), reduction='mean')
            return loss
        else:
            # CRF decoding (prediction)
            prediction = self.crf.decode(emissions, mask=attention_mask.bool())
            return emissions  # Make sure to return emissions here
class CustomTrainer(Trainer):
    def __init__(self, *args, crf_layer=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.crf_layer = crf_layer

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")  # Extract the labels
        emissions = model(**inputs)  # Get the emissions from the model
        emissions = torch.stack(emissions) if isinstance(emissions, list) else emissions
        # Check the attention mask
        mask = inputs["attention_mask"].bool()
        if mask.size(0) == 0 or mask[:, 0].sum() == 0:
            raise ValueError("The mask of the first timestep must be on")
        # Compute the CRF loss
        loss = -self.crf_layer(emissions, labels, mask=mask)
        return (loss, inputs) if return_outputs else loss
# Load BERT model
from transformers import BertModel
bert_model = BertModel.from_pretrained("bert-base-cased")

model = BERT_CRF_Model(bert_model, num_labels=len(unique_labels))
crf_layer = CRF(num_tags=len(unique_labels))

training_args = TrainingArguments(
    output_dir="my_awesome_ner_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    crf_layer=crf_layer,  # Pass the CRF layer
)

trainer.train()
I'm not sure why this error occurs; any help would be greatly appreciated!
pytorch-crf expects the mask of the first timestep to be all on for every sequence, it does not accept -100 as a tag id (only ids in [0, num_labels) are valid), and it expects torch tensors with the mask switched on wherever the labels are real. In your pipeline, those -100s come from tokenize_and_align_labels (special tokens and non-first subwords) and from DataCollatorForTokenClassification, which pads the labels with -100 by default. So do the following:
# (here labels and logits are numpy arrays, e.g. inside a prediction step)
mask = labels != self.pad_token_id   # mask out padded positions
mask[:, 0] = True                    # pytorch-crf needs the first timestep on
emissions_torch = torch.from_numpy(logits).float().to(self.device)
mask_torch = torch.from_numpy(mask).bool().to(self.device)
predictions = self.model.crf.decode(emissions=emissions_torch, mask=mask_torch)
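Applied to your CustomTrainer, a minimal sketch of a compute_loss following this recipe could look like the one below. It is one way to wire it up, not the canonical API: it assumes the model is not wrapped (so model.crf is reachable), reuses the CRF inside BERT_CRF_Model (which you built with batch_first=True) instead of the standalone crf_layer, and uses 0 as an arbitrary filler tag. Note that your standalone crf_layer defaults to batch_first=False, which on its own reproduces exactly this ValueError when given batch-first tensors, and its parameters are not in model.parameters(), so the Trainer never trains it.

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        emissions = model(**inputs)  # BERT_CRF_Model.forward returns emissions when labels is None

        # -100 marks special tokens, non-first subwords, and padding; mask them out.
        mask = labels != -100
        # pytorch-crf requires every sequence's first timestep to be on,
        # so force position 0 (the [CLS] token) back on.
        mask[:, 0] = True

        # The CRF only accepts tag ids in [0, num_labels), so replace every -100
        # with a valid filler (0 here). All filler positions except position 0
        # are masked, so only the forced-on [CLS] slot is scored with the filler.
        safe_labels = labels.clone()
        safe_labels[labels == -100] = 0

        # Use the CRF inside the model so its transition parameters are trained.
        loss = -model.crf(emissions, safe_labels, mask=mask, reduction='mean')
        return (loss, {"emissions": emissions}) if return_outputs else loss

With this version the crf_layer= constructor argument and the manual mask check become unnecessary.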
Good luck.