I am using a pretrained BertForTokenClassification for a nested named entity recognition task. To represent nested entities I use a multi-label approach: the model returns three lists of logits, one per nesting level, which are combined at the end. I am running training on Linux Ubuntu 22.04 with 16 GB of RAM.
The problem is that the training process gets killed with an OutOfMemory error, no matter whether batch_size is 1 or 16. Memory consumption keeps growing until the process is killed; a smaller batch_size only delays the moment when this happens.
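For context, each token carries one label id per nesting level, so a batch yields a labels tensor of shape [batch_size, 3, seq_len], and label id 0 is skipped by the loss (ignore_index=0). A minimal illustrative sketch of that layout (the concrete values here are made up):

import torch

batch_size, num_levels, seq_len = 4, 3, 255
labels = torch.zeros(batch_size, num_levels, seq_len, dtype=torch.long)  # id 0 is ignored by the loss
labels[0, 0, 5] = 2   # level-1 tag for token 5 of the first sample (outer entity)
labels[0, 1, 5] = 4   # level-2 tag for the same token (nested entity)
# labels[:, i] is the target that F.cross_entropy receives for level i in the training loop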
Model class:
import torch.nn as nn
from transformers import BertForTokenClassification


class NestedNERMultiLabelModel(nn.Module):
    def __init__(self, model_name, num_labels_level1, num_labels_level2, num_labels_level3, dropout):
        super(NestedNERMultiLabelModel, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained(model_name, hidden_dropout_prob=dropout)
        # One linear head per nesting level, all fed from the last hidden state
        self.classifier_level1 = nn.Linear(self.bert.config.hidden_size, num_labels_level1)
        self.classifier_level2 = nn.Linear(self.bert.config.hidden_size, num_labels_level2)
        self.classifier_level3 = nn.Linear(self.bert.config.hidden_size, num_labels_level3)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        out = outputs.hidden_states[-1]
        logits_level1 = self.classifier_level1(out)
        logits_level2 = self.classifier_level2(out)
        logits_level3 = self.classifier_level3(out)
        return logits_level1, logits_level2, logits_level3
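A quick shape sanity check (just a sketch, assuming the rubert-base-cased model and the num_labels = 8 / max_sequence_len = 255 values from the config below); each head returns logits of shape [batch, seq_len, num_labels]:

from transformers import BertTokenizerFast
from NestedNERMultiLabelModel import NestedNERMultiLabelModel

tokenizer = BertTokenizerFast.from_pretrained("DeepPavlov/rubert-base-cased")
model = NestedNERMultiLabelModel("DeepPavlov/rubert-base-cased", 8, 8, 8, dropout=0.2)

enc = tokenizer("Пример текста", return_tensors="pt",
                padding="max_length", max_length=255, truncation=True)
l1, l2, l3 = model(enc["input_ids"], attention_mask=enc["attention_mask"])
print(l1.shape, l2.shape, l3.shape)  # each: torch.Size([1, 255, 8])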
Training module:
import torch
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm

from NestedNERMultiLabelModel import NestedNERMultiLabelModel


class Trainer:
    def __init__(self, config, preprocessor):
        self.config = config
        self.preprocessor = preprocessor
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = NestedNERMultiLabelModel(config["bert_model_name"], config["num_labels"],
                                              config["num_labels"], config["num_labels"],
                                              config["dropout_rate"])
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=config["learning_rate"],
            weight_decay=self.config["weight_decay"]
        )
        self.start_epoch = 0
        self.model = self.model.to(self.device)
        self.epochs = config["num_epochs"]
    def train(self, train_loader, valid_loader):
        num_training_steps = len(train_loader) * (self.epochs - self.start_epoch)
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=500,
            num_training_steps=num_training_steps
        )
        best_loss = 1000
        with tqdm(range(num_training_steps)) as progress_bar:
            for epoch in range(self.start_epoch, self.epochs):
                train_loss = 0
                self.model.train()
                for input_ids, attention_mask, labels in train_loader:
                    input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
                    self.optimizer.zero_grad()
                    labels_pred = self.model(input_ids, attention_mask)
                    # Sum the cross-entropy losses of the three nesting levels
                    total_loss = 0
                    for i in range(3):
                        loss = F.cross_entropy(labels_pred[i].view(-1, 8), labels[:, i].reshape(-1), ignore_index=0)
                        total_loss += loss
                    # Update model weights
                    total_loss.backward()
                    train_loss += total_loss
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.config["grad_norm"])
                    self.optimizer.step()
                    scheduler.step()
                    progress_bar.update(1)
                train_loss = train_loss / num_training_steps
                with torch.no_grad():
                    self.model.eval()
                    eval_loss = self.evaluate(self.model, valid_loader)
                    print(f'Epoch: {epoch} | train_loss: {train_loss:<5} | eval_loss: {eval_loss:<5}')
                    self.model.train()
                if eval_loss < best_loss:
                    self.save_checkpoint(self.model, self.optimizer, epoch)
                    best_loss = eval_loss
        # Log the hyperparameters and final losses of this run
        data = [self.config['train_batch_size'], self.config['learning_rate'], self.config['dropout_rate'],
                self.config['weight_decay'], self.config['grad_norm'], round(train_loss, 3),
                round(eval_loss, 3)]
        with open("test.txt", "a") as f:
            f.write(" | ".join(f"{str(item):<13}" for item in data) + "\n")
    def evaluate(self, model, dataloader):
        true_labels = {'labels_0': [], 'labels_1': [], 'labels_2': []}
        pred_labels = {'labels_0': [], 'labels_1': [], 'labels_2': []}
        eval_loss = 0
        for input_ids, attention_mask, labels in dataloader:
            input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
            labels_pred = model(input_ids, attention_mask)
            total_loss = 0
            for i in range(3):
                loss = F.cross_entropy(labels_pred[i].view(-1, 8), labels[:, i].reshape(-1), ignore_index=0)
                total_loss += loss
                # Collect per-level predictions and gold labels
                labels_pred_logits = labels_pred[i].argmax(dim=-1)
                labels_pred_logits = labels_pred_logits.detach().cpu().tolist()
                labels_true = labels[:, i].detach().cpu().tolist()
                for l in labels_pred_logits:
                    pred_labels[f'labels_{i}'].append(l)
                for l in labels_true:
                    true_labels[f'labels_{i}'].append(l)
            eval_loss += total_loss
        eval_loss /= len(dataloader)
        return eval_loss
    def save_checkpoint(self, model, optimizer, epoch):
        checkpoint_path = f"{self.config['checkpoints_dir']}/model_{epoch}.pth"
        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'id_to_label': self.preprocessor.id_to_label,
                'max_len': self.preprocessor.max_len,
                'batch_size': self.config["train_batch_size"],
                'epoch': epoch
            },
            checkpoint_path
        )
        print(f"Model saved to: {checkpoint_path}")
The config.json file:
{
"datasets_dir": "datasets/",
"checkpoints_dir": "models/",
"train_dataset": "train_t23_v1.jsonl",
"bert_model_name": "DeepPavlov/rubert-base-cased",
"train_samples_start_idx": 0,
"train_samples_end_idx": 640,
"valid_samples_start_idx": 640,
"valid_samples_end_idx": 800,
"test_samples_start_idx": 800,
"test_samples_end_idx": 840,
"train_batch_size": 4,
"valid_batch_size": 4,
"test_batch_size": 1,
"max_sequence_len": 255,
"num_labels": 8,
"learning_rate": 2e-5,
"num_epochs": 5,
"dropout_rate": 0.2,
"weight_decay": 0.0005,
"grad_norm": 1.0
}
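The Preprocessor and the DataLoader construction are not shown above, so here is a purely illustrative wiring sketch that matches what Trainer.train() unpacks: random tensors stand in for the real dataset, a SimpleNamespace stands in for the preprocessor, and the Trainer module name is assumed by analogy with the NestedNERMultiLabelModel import.

import json
import types

import torch
from torch.utils.data import DataLoader, TensorDataset

from Trainer import Trainer  # assumed module name, mirroring the NestedNERMultiLabelModel import style

with open("config.json") as f:
    config = json.load(f)

# Trainer.save_checkpoint only needs id_to_label and max_len from the preprocessor
preprocessor = types.SimpleNamespace(id_to_label={0: "O"}, max_len=config["max_sequence_len"])

def dummy_loader(num_samples, batch_size):
    # Random tensors shaped like the real batches: (input_ids, attention_mask, labels),
    # with labels of shape [num_samples, 3, max_sequence_len]
    input_ids = torch.randint(0, 30000, (num_samples, config["max_sequence_len"]))
    attention_mask = torch.ones(num_samples, config["max_sequence_len"], dtype=torch.long)
    labels = torch.randint(0, config["num_labels"], (num_samples, 3, config["max_sequence_len"]))
    return DataLoader(TensorDataset(input_ids, attention_mask, labels),
                      batch_size=batch_size, shuffle=True)

train_loader = dummy_loader(640, config["train_batch_size"])
valid_loader = dummy_loader(160, config["valid_batch_size"])

trainer = Trainer(config, preprocessor)
trainer.train(train_loader, valid_loader)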
This does not happen on Windows 11 with the same amount of RAM available. The only difference there is that training is about 3x slower.
Are you training on CPU only? It looks like you need a GPU to run a model like this.