使用bert模型的google colab内存问题

问题描述 投票:0回答:1

问题: 我尝试按照一篇论文的做法,使用 BERT 模型(一种 NLP 模型)对我的序列进行编码,但训练非常耗时,而且在跑完第 1 个 epoch 之前 Colab 就会断开连接;当我把批量大小增加到 16 或 32 时,又会遇到内存不足的问题。下面是我的代码,如果其中有问题,请告诉我如何解决,以便我摆脱这个困境:

Python代码:

这是我正在使用的代码片段:

import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from Bio import SeqIO

# Path of the FASTA file holding the labelled RNA sequences.
fasta_path = "/content/dataset_Rfam_6320_13classes.fasta"

# Map record id -> [upper-cased sequence, second whitespace-separated token
# of the FASTA description]. That second token is used as the class label.
sequence_dict = {
    rec.id: [str(rec.seq).upper(), rec.description.split()[1]]
    for rec in SeqIO.parse(fasta_path, "fasta")
}

# One row per record, indexed by record id. The path now has its own name
# (`fasta_path`) so that `data` only ever refers to the DataFrame.
data = pd.DataFrame.from_dict(sequence_dict, orient="index", columns=["Seq", "RNA_type"])
data["length"] = data["Seq"].map(len)

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Write a checkpoint object to disk with ``torch.save``.

    Args:
        state: Object to serialize (the training loop passes a dict holding
            the model's state dict).
        filename: Destination path of the checkpoint file.
    """
    print("=> saving checkpoint")
    torch.save(state, filename)
def load_checkpoint(checkpoint, model=None):
    """Restore model weights from a checkpoint dict.

    Args:
        checkpoint: Mapping produced by ``torch.load``. The weights are read
            from the ``'state_dict'`` entry; the misspelled ``'stat_dict'``
            key is accepted as a fallback so that checkpoints written with
            that key can still be restored.
        model: Module to load the weights into. When omitted, the
            module-level ``bert_model`` is used (the original referenced an
            undefined name ``model``).

    Raises:
        KeyError: If the checkpoint holds neither key.
    """
    print("=> Loading checkpoint")
    if "state_dict" in checkpoint:
        weights = checkpoint["state_dict"]
    elif "stat_dict" in checkpoint:
        weights = checkpoint["stat_dict"]
    else:
        raise KeyError("state_dict")

    target = bert_model if model is None else model
    target.load_state_dict(weights)

# Keep 70% of the rows for training and the rest for testing.
# `shuffle` is a boolean flag; the seed that makes the split reproducible
# belongs in `random_state` (the original passed 42 as `shuffle`, which only
# acted as True and left the split unseeded).
# NOTE(review): consider stratify=data["RNA_type"] if every class has enough
# samples, so both splits keep the same class proportions.
train_X, test_X, train_Y, test_Y = train_test_split(
    data["Seq"],
    data["RNA_type"],
    train_size=0.7,
    shuffle=True,
    random_state=42,
)

from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
)

# Tokenizer used to encode the k-mer strings; `do_lower_case=False` is
# forwarded to the tokenizer so the casing of the input is left as-is.
tokenizer = AutoTokenizer.from_pretrained("Lancelot53/rna-tokenizer-4096", do_lower_case=False)

def return_kmer(seq, K=3):
    """Return the overlapping k-mers of ``seq`` as one space-separated string.

    A window of width ``K`` is slid across the sequence one position at a
    time, e.g. ``return_kmer("AUGC")`` gives ``"AUG UGC"``.

    Args:
        seq: Sequence string to split.
        K: Window width; must be at least 1.

    Returns:
        The k-mers joined by single spaces, or an empty string when ``seq``
        is shorter than ``K``.

    Raises:
        ValueError: If ``K`` is smaller than 1.
    """
    if K < 1:
        raise ValueError(f"K must be a positive integer, got {K}")
    return " ".join(seq[start : start + K] for start in range(len(seq) - K + 1))

# Convert every raw sequence of each split into its space-separated k-mer
# string, which is the text form handed to the tokenizer.
train_kmers = [return_kmer(seq) for seq in train_X]
test_kmers = [return_kmer(seq) for seq in test_X]

# Tokenizer settings shared by both splits, defined once so the train and
# test encodings cannot drift apart.
_ENCODE_KWARGS = dict(
    max_length=512,  # max len of BERT
    padding=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)

# Each split is tokenized up front in a single call; the resulting tensors
# are indexed per sample by the dataset class.
train_encodings = tokenizer.batch_encode_plus(train_kmers, **_ENCODE_KWARGS)
test_encodings = tokenizer.batch_encode_plus(test_kmers, **_ENCODE_KWARGS)

class TokenData(Dataset):
    """Dataset pairing pre-computed token encodings with integer class labels.

    The data is taken from module-level variables: ``train_X`` /
    ``train_encodings`` / ``train_Y`` when ``train`` is true, otherwise the
    ``test_*`` counterparts.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, train=False):
        """Select the split to serve.

        Args:
            train: Serve the training split when true, the test split
                otherwise.
        """
        if train:
            self.text_data = train_X
            self.tokens = train_encodings
            self.labels = list(train_Y)
        else:
            self.text_data = test_X
            self.tokens = test_encodings
            self.labels = list(test_Y)

        # The label -> index map is built from the sorted union of the labels
        # of BOTH splits. Enumerating `set(self.labels)` per split (as before)
        # relies on arbitrary set ordering, so the train and test datasets
        # could assign different ids to the same class.
        all_labels = sorted(set(train_Y) | set(test_Y))
        self.label_encoder = {label: i for i, label in enumerate(all_labels)}
        self.num_classes = len(self.label_encoder)

    def __len__(self):
        """Return the number of sequences in the selected split."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return the encoding tensors and the label of sample ``idx``.

        Returns:
            A dict with one entry per tokenizer output (row ``idx`` of each
            tensor) plus ``'labels'``, a scalar ``torch.long`` tensor.
        """
        # The encodings already hold tensors, so the row is indexed directly
        # instead of being re-wrapped with torch.tensor().
        sample = {key: values[idx] for key, values in self.tokens.items()}

        encoded_label = self.label_encoder[self.labels[idx]]
        sample['labels'] = torch.tensor(encoded_label, dtype=torch.long)
        return sample

batch_size = 8
train_dataset = TokenData(train=True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TokenData(train=False)
# Evaluation metrics do not depend on sample order, so the test loader is
# left unshuffled to keep evaluation runs comparable.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# The device (hardware) on which the computation is carried out: CUDA GPU
# first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# NOTE(review): `bert_model` is never created in this snippet; it must be
# defined before this point (presumably via
# AutoModelForSequenceClassification.from_pretrained with
# num_labels=train_dataset.num_classes -- confirm the checkpoint name).
# The model is moved to the target device before the optimizer is built on
# its parameters.
bert_model.to(device)

optimizer = AdamW(bert_model.parameters(), lr=1e-5)

loss_fn = torch.nn.CrossEntropyLoss()

num_epochs = 3  # number of passes over the whole training set

load_model = False
if load_model:
    # Same file name that save_checkpoint() writes by default (the original
    # read "my_checkpoint.ptch.tar", which is never written). map_location
    # lets a checkpoint saved on GPU be restored on whatever device is
    # available now.
    load_checkpoint(torch.load("my_checkpoint.pth.tar", map_location=device))

for epoch in range(num_epochs):
    print("Epoch: ", (epoch + 1))

    # TRAINING BLOCK STARTS
    bert_model.train()
    train_loss_sum = 0.0
    train_batches = 0
    for i, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Setting the gradients to zero
        optimizer.zero_grad()

        # Passing the data to the model
        outputs = bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

        # The logits will be used for measuring the loss
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        # Calculating the gradient for the loss function
        loss.backward()

        # Optimizing the parameters of the bert model
        optimizer.step()

        # CrossEntropyLoss already averages over the batch, so the value is
        # logged as-is (it was previously divided by batch_size a second time).
        train_last_loss = loss.item()
        train_loss_sum += train_last_loss
        train_batches += 1

        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))

    # Logging epoch-wise training loss: mean over all batches of the epoch,
    # not just the value of the final batch.
    print(f"\nTraining epoch {epoch + 1} loss: ", train_loss_sum / max(train_batches, 1))
    # TRAINING BLOCK ENDS

    # Save after every completed epoch (including the last one) so a dropped
    # session loses at most one epoch. The key is spelled 'state_dict', which
    # is what load_checkpoint() reads.
    checkpoint = {'state_dict': bert_model.state_dict(), 'epoch': epoch + 1}
    save_checkpoint(checkpoint)

    # TESTING BLOCK STARTS
    bert_model.eval()
    correct = 0
    seen = 0
    test_loss_sum = 0.0
    test_last_loss = float("nan")  # stays NaN only if the test loader is empty
    test_pred = []
    for i, batch in enumerate(test_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # We don't need gradients for testing
        with torch.no_grad():
            outputs = bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

            # Logits act as predictions
            logits = outputs.logits

            # Mean loss of this batch
            loss = loss_fn(logits, batch['labels'])

        n_samples = batch['labels'].size(0)
        test_last_loss = loss.item()
        # Weight by the real batch size so a smaller final batch does not
        # skew the epoch mean.
        test_loss_sum += test_last_loss * n_samples
        print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))

        # Comparing the predicted target with the labels in the batch
        predicted = logits.argmax(1)
        test_pred.extend(predicted.tolist())
        correct += (predicted == batch['labels']).sum().item()
        # Divide by the number of samples actually seen; (i + 1) * batch_size
        # over-counts when the last batch is not full.
        seen += n_samples
        print("Testing accuracy: ", correct / seen)

    print(f"\nTesting epoch {epoch + 1} last loss: ", test_last_loss)
    if seen:
        print(f"Testing epoch {epoch + 1} mean loss: ", test_loss_sum / seen)
        print(f"Testing epoch {epoch + 1} accuracy: ", correct / seen)
    # TESTING BLOCK ENDS

我想要什么: 我只需要一种方法,让模型至少能完整运行一个 epoch 而不断开连接。在这个 BERT 模型跑通之后,我还会对代码做进一步修改,为它加上另一个分类器部分。

deep-learning transform huggingface-transformers bert-language-model
1个回答
0
投票

我的意见,你可以试试:

  • 使用“adafactor”优化器 [https://huggingface.co/docs/transformers/main_classes/optimizer_schedules][1]
  • 梯度累积 [https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation][2]
  • 使用多 GPU 训练模型以共享每个 GPU 的参数。
  • 使用 FP16(半精度)训练。例如:
    training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)
© www.soinside.com 2019 - 2024. All rights reserved.