Out of memory when training a pretrained BERT model for token classification

Problem description (votes: 0, answers: 1)

I am using a pretrained BertForTokenClassification for a nested named-entity-recognition task. To represent nested entities I use a multi-label approach: the model returns three lists of logits, one per nesting level, which are combined at the end. I am running training on Linux Ubuntu 22.04 with 16 GB of RAM.

The problem is that training gets interrupted by an OutOfMemory error no matter what batch_size I use (1 or 16): memory consumption keeps growing until the process is killed. With a smaller batch_size the end just comes a little later.
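
The growth itself is easy to see by logging the resident memory of the Python process once per training step. A rough sketch of such a check (psutil is not part of the training code, it is only used here for monitoring):

import os

import psutil

# Handle to the current training process
process = psutil.Process(os.getpid())

def log_memory(step):
    # Resident set size of the process, in MB
    rss_mb = process.memory_info().rss / 1024 ** 2
    print(f"step {step}: RSS = {rss_mb:.1f} MB")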

Model class:

import torch.nn as nn
from transformers import BertForTokenClassification


class NestedNERMultiLabelModel(nn.Module):
    def __init__(self, model_name, num_labels_level1, num_labels_level2, num_labels_level3, dropout):
        super(NestedNERMultiLabelModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained(model_name, hidden_dropout_prob=dropout)

        self.classifier_level1 = nn.Linear(self.bert.config.hidden_size, num_labels_level1)

        self.classifier_level2 = nn.Linear(self.bert.config.hidden_size, num_labels_level2)

        self.classifier_level3 = nn.Linear(self.bert.config.hidden_size, num_labels_level3)

    def forward(self, input_ids, attention_mask=None):

        outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        out = outputs.hidden_states[-1]

        logits_level1 = self.classifier_level1(out)

        logits_level2 = self.classifier_level2(out)

        logits_level3 = self.classifier_level3(out)

        return logits_level1, logits_level2, logits_level3
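
For reference, a quick shape check of the three heads looks roughly like this (a sketch; model name, label count and sequence length are taken from my config below):

import torch
from transformers import BertTokenizerFast

from NestedNERMultiLabelModel import NestedNERMultiLabelModel

tokenizer = BertTokenizerFast.from_pretrained("DeepPavlov/rubert-base-cased")
model = NestedNERMultiLabelModel("DeepPavlov/rubert-base-cased", 8, 8, 8, dropout=0.2)

enc = tokenizer("Пример предложения", return_tensors="pt",
                padding="max_length", truncation=True, max_length=255)
with torch.no_grad():
    l1, l2, l3 = model(enc["input_ids"], attention_mask=enc["attention_mask"])

# Each head returns (batch_size, sequence_length, num_labels), i.e. (1, 255, 8) here
print(l1.shape, l2.shape, l3.shape)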

Training module:

import torch
from transformers import get_linear_schedule_with_warmup

from NestedNERMultiLabelModel import NestedNERMultiLabelModel
import torch.nn.functional as F
from tqdm.auto import tqdm

class Trainer:

    def __init__(self, config, preprocessor):
        self.config = config
        self.preprocessor = preprocessor
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = NestedNERMultiLabelModel(config["bert_model_name"], config["num_labels"], config["num_labels"], config["num_labels"], config['dropout_rate'])
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=config["learning_rate"],
            weight_decay=self.config["weight_decay"]
        )
        self.start_epoch = 0

        self.model = self.model.to(self.device)
        self.epochs = config["num_epochs"]

    def train(self, train_loader, valid_loader):

        num_training_steps = len(train_loader) * (self.epochs - self.start_epoch)

        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=500,
            num_training_steps=num_training_steps
        )

        best_loss = 1000
        with tqdm(range(num_training_steps)) as progress_bar:
            for epoch in range(self.start_epoch, self.epochs):

                train_loss = 0
                self.model.train()

                for input_ids, attention_mask, labels in train_loader:
                    input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)

                    self.optimizer.zero_grad()
                    labels_pred = self.model(input_ids, attention_mask)

                    total_loss = 0
                    for i in range(3):
                        loss = F.cross_entropy(labels_pred[i].view(-1, 8), labels[:, i].reshape(-1), ignore_index=0)
                        total_loss += loss

                    # Update model weights
                    total_loss.backward()
                    train_loss += total_loss

                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=self.config["grad_norm"])
                    self.optimizer.step()
                    scheduler.step()
                    progress_bar.update(1)

                train_loss = train_loss / num_training_steps

                with torch.no_grad():
                    self.model.eval()
                    eval_loss = self.evaluate(self.model, valid_loader)
                    print(f'Epoch: {epoch} | train_loss: {train_loss:<5} | eval_loss: {eval_loss:<5}')
                    self.model.train()

                if eval_loss < best_loss:
                    self.save_checkpoint(self.model, self.optimizer, epoch)
                    best_loss = eval_loss
                    data = [self.config['train_batch_size'], self.config['learning_rate'], self.config['dropout_rate'],
                            self.config['weight_decay'], self.config['grad_norm'], round(train_loss, 3),
                            round(eval_loss, 3)]
                    with open("test.txt", "a") as f:
                        f.write(" | ".join(f"{str(item):<13}" for item in data) + "\n")

    def evaluate(self, model, dataloader):

        true_labels = {'labels_0': [], 'labels_1': [], 'labels_2': []}
        pred_labels = {'labels_0': [], 'labels_1': [], 'labels_2': []}
        eval_loss = 0

        for input_ids, attention_mask, labels in dataloader:
            input_ids, attention_mask = input_ids.to(self.device), attention_mask.to(self.device)
            labels_pred = model(input_ids, attention_mask)

            total_loss = 0
            for i in range(3):
                loss = F.cross_entropy(labels_pred[i].view(-1, 8), labels[:, i].reshape(-1), ignore_index=0)
                total_loss += loss

                labels_pred_logits = labels_pred[i].argmax(dim=-1)
                labels_pred_logits = labels_pred_logits.detach().cpu().tolist()
                labels_true = labels[:, i].detach().cpu().tolist()
                for l in labels_pred_logits:
                    pred_labels[f'labels_{i}'].append(l)
                for l in labels_true:
                    true_labels[f'labels_{i}'].append(l)

            eval_loss += total_loss

        eval_loss /= len(dataloader)

        return eval_loss

    def save_checkpoint(self, model, optimizer, epoch):
        checkpoint_path = f"{self.config['checkpoints_dir']}/model_{epoch}.pth"
        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'id_to_label': self.preprocessor.id_to_label,
                'max_len': self.preprocessor.max_len,
                'batch_size': self.config["train_batch_size"],
                'epoch': epoch
            },
            checkpoint_path
        )
        print(f"Model saved to: {self.config['checkpoints_dir']}/model_{epoch}.pth")

The config.json file:

{
  "datasets_dir": "datasets/",
  "checkpoints_dir": "models/",
  "train_dataset": "train_t23_v1.jsonl",
  "bert_model_name": "DeepPavlov/rubert-base-cased",
  "train_samples_start_idx": 0,
  "train_samples_end_idx": 640,
  "valid_samples_start_idx": 640,
  "valid_samples_end_idx": 800,
  "test_samples_start_idx": 800,
  "test_samples_end_idx": 840,
  "train_batch_size": 4,
  "valid_batch_size": 4,
  "test_batch_size": 1,
  "max_sequence_len": 255,
  "num_labels": 8,
  "learning_rate": 2e-5,
  "num_epochs": 5,
  "dropout_rate": 0.2,
  "weight_decay": 0.0005,
  "grad_norm": 1.0
}
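
With these values the training schedule works out as follows (a sketch, assuming the config is read with json.load):

import json

with open("config.json") as f:
    config = json.load(f)

# 640 training samples at batch_size 4 -> 160 batches per epoch,
# so num_training_steps = 160 * 5 = 800 for the scheduler in Trainer.train().
train_samples = config["train_samples_end_idx"] - config["train_samples_start_idx"]
steps_per_epoch = train_samples // config["train_batch_size"]
print(steps_per_epoch, steps_per_epoch * config["num_epochs"])  # 160 800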

But this does not happen on Windows 11 with the same amount of RAM available. The only difference is that training there is about 3 times slower.

python pytorch huggingface-transformers bert-language-model named-entity-recognition
1 Answer (0 votes)

Are you running on CPU only? It seems you need a GPU to train a model like this.
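
You can verify what the Trainer actually picks with something like:

import torch

# The Trainer in the question chooses the device as
# torch.device("cuda" if torch.cuda.is_available() else "cpu"),
# so if this prints False it silently falls back to CPU.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)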
