问题: 我尝试在一篇论文中使用 BERT 模型,用 NLP 模型对我的序列进行编码,但训练需要很多时间,并且在第 1 个 epoch 结束之前,它就会遇到连接中断的问题;当我将批量大小增加到 16 或 32 时,又会遇到内存不足的问题。下面是我的代码,如果其中有问题,请告诉我如何解决,这样我就可以摆脱这个问题:
Python代码:
这是我正在使用的代码片段:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from Bio import SeqIO
# Location of the FASTA file that holds the labelled RNA sequences.
fasta_path = "/content/dataset_Rfam_6320_13classes.fasta"

# Map each record id to [upper-cased sequence, RNA type]. The RNA type is taken
# from the second whitespace-separated token of the record's description line.
sequence_dict = {
    rec.id: [str(rec.seq).upper(), rec.description.split()[1]]
    for rec in SeqIO.parse(fasta_path, "fasta")
}

# One row per record id; the sequence and its class label become columns.
data = pd.DataFrame.from_dict(sequence_dict, orient="index", columns=["Seq", "RNA_type"])
data["length"] = data["Seq"].map(len)
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Write a checkpoint object to disk with ``torch.save``.

    Args:
        state: Object to persist; the training loop passes a dict that holds
            the model's state dict.
        filename: Path of the checkpoint file to write.
    """
    print("=> saving checkpoint")
    torch.save(state, filename)
def load_checkpoint(checkpoint, model=None):
    """Restore model weights from a checkpoint dict.

    Args:
        checkpoint: Mapping that stores the weights under ``'state_dict'``.
            The misspelled ``'stat_dict'`` key is accepted as a fallback so
            checkpoints written under that name can still be loaded.
        model: Module that receives the weights. When omitted, the
            module-level ``bert_model`` is used.

    Raises:
        KeyError: If the checkpoint contains neither key.
    """
    print("=> Loading checkpoint")
    if model is None:
        # The script names its network ``bert_model``; resolve it lazily so the
        # function can be defined before the model exists.
        model = bert_model
    for key in ("state_dict", "stat_dict"):
        if key in checkpoint:
            model.load_state_dict(checkpoint[key])
            return
    raise KeyError("checkpoint has no 'state_dict' entry")
# Hold out 30% of the sequences for evaluation. ``shuffle`` is a boolean flag;
# the seed that makes the split reproducible belongs in ``random_state``.
train_X, test_X, train_Y, test_Y = train_test_split(
    data["Seq"],
    data["RNA_type"],
    train_size=0.7,
    shuffle=True,
    random_state=42,
)
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
)
# Name of the pretrained tokenizer handed to ``from_pretrained``.
TOKENIZER_NAME = 'Lancelot53/rna-tokenizer-4096'

# do_lower_case=False is forwarded as-is; presumably it keeps the upper-cased
# sequences untouched — confirm against the tokenizer's configuration.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, do_lower_case=False)
def return_kmer(seq, K=3):
    """Split a sequence into overlapping k-mers joined by single spaces.

    Args:
        seq: Sequence string to split.
        K: Window size of each k-mer.

    Returns:
        The k-mers in order of appearance, separated by one space. The result
        is an empty string when ``seq`` is shorter than ``K``.

    Raises:
        ValueError: If ``K`` is smaller than 1.
    """
    if K < 1:
        raise ValueError(f"K must be a positive integer, got {K}")
    # Move a window of size K across the sequence, one position at a time.
    return " ".join(seq[start : start + K] for start in range(len(seq) - K + 1))
def _encode_kmers(kmer_texts):
    """Tokenize a list of k-mer strings into padded PyTorch tensors.

    Both splits go through this one helper so they share the same settings.
    """
    # Calling the tokenizer directly replaces the deprecated batch_encode_plus.
    return tokenizer(
        kmer_texts,
        max_length=512,  # max len of BERT
        padding=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )


# Turn every raw sequence into its space-separated k-mer representation.
train_kmers = [return_kmer(seq) for seq in train_X]
test_kmers = [return_kmer(seq) for seq in test_X]

train_encodings = _encode_kmers(train_kmers)
test_encodings = _encode_kmers(test_kmers)
class TokenData(Dataset):
    """Dataset over the pre-tokenized train or test split.

    The split is read from the module-level ``train_*`` / ``test_*`` objects.
    Each item is a dict with one entry per tokenizer output plus an integer
    class id under ``'labels'``.

    Args:
        train: Select the training split when true, the test split otherwise.
    """

    def __init__(self, train=False):
        if train:
            self.text_data = train_X
            self.tokens = train_encodings
            self.labels = list(train_Y)
        else:
            self.text_data = test_X
            self.tokens = test_encodings
            self.labels = list(test_Y)
        # Build the mapping from both splits in sorted order, so the train and
        # test datasets (and separate runs) give every class the same id.
        self.label_encoder = self._build_label_encoder(train_Y, test_Y)
        self.num_classes = len(self.label_encoder)

    @staticmethod
    def _build_label_encoder(*label_groups):
        """Return a ``{label: index}`` map over the sorted union of all labels."""
        classes = set()
        for group in label_groups:
            classes.update(group)
        return {label: index for index, label in enumerate(sorted(classes))}

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        # as_tensor avoids the copy-construct warning that torch.tensor()
        # emits when it is handed an existing tensor.
        sample = {name: torch.as_tensor(values[idx]) for name, values in self.tokens.items()}
        encoded_label = self.label_encoder[self.labels[idx]]
        sample['labels'] = torch.tensor(encoded_label, dtype=torch.long)
        return sample
batch_size = 8

train_dataset = TokenData(train=True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataset = TokenData(train=False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

# NOTE(review): bert_model is never created in this snippet. It has to be
# defined before this point, presumably with
# AutoModelForSequenceClassification.from_pretrained(...) and a num_labels
# that matches train_dataset.num_classes — TODO confirm the checkpoint to use.
optimizer = AdamW(bert_model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 3  # number of passes over the whole training set

# Hardware the computation runs on: CUDA first, then Apple MPS, then the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
bert_model.to(device)  # Transfer model to GPU if available

load_model = False
if load_model:
    # Use the file name save_checkpoint writes by default; map_location puts
    # the stored tensors on the device chosen above.
    load_checkpoint(torch.load("my_checkpoint.pth.tar", map_location=device))
for epoch in range(num_epochs):
    print("Epoch: ", (epoch + 1))

    # TRAINING BLOCK STARTS
    bert_model.train()
    train_loss_sum = 0.0
    train_batches = 0
    for i, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Setting the gradients to zero
        optimizer.zero_grad()

        # Passing the data to the model
        outputs = bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

        # The logits will be used for measuring the loss
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])

        # Calculating the gradient for the loss function
        loss.backward()

        # Optimizing the parameters of the bert model
        optimizer.step()

        # CrossEntropyLoss already averages over the batch, so the value is
        # logged as-is instead of being divided by batch_size again.
        train_last_loss = loss.item()
        train_loss_sum += train_last_loss
        train_batches += 1
        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))

    # Logging epoch-wise training loss (mean over all batches of the epoch)
    print(f"\nTraining epoch {epoch + 1} loss: ", train_loss_sum / max(train_batches, 1))

    # Save after every finished epoch, the last one included, under the
    # 'state_dict' key that load_checkpoint reads.
    checkpoint = {'state_dict': bert_model.state_dict()}
    save_checkpoint(checkpoint)
    # TRAINING BLOCK ENDS

    # TESTING BLOCK STARTS
    bert_model.eval()
    correct = 0
    seen = 0
    test_last_loss = float("nan")
    # We don't need gradients for testing
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = bert_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

            # Logits act as predictions
            logits = outputs.logits

            # Mean loss of this batch, computed from the logits and labels
            loss = loss_fn(logits, batch['labels'])
            test_last_loss = loss.item()
            print('Testing batch {} loss: {}'.format(i + 1, test_last_loss))

            # Comparing the predicted target with the labels in the batch
            correct += (logits.argmax(1) == batch['labels']).sum().item()
            # Count real samples so a smaller final batch is not over-counted.
            seen += batch['labels'].size(0)

    print("Testing accuracy: ", correct / max(seen, 1))
    print(f"\nTesting epoch {epoch + 1} last loss: ", test_last_loss)
    # TESTING BLOCK ENDS
我想要什么: 我只需要一种方法,让我的模型至少能完整跑完一个 epoch 而不会断开连接,因为在试完这个 BERT 模型之后,我还会对代码做进一步修改,为它加上另一个分类器部分。
我的意见,你可以试试:
# NOTE(review): TrainingArguments and default_args are neither imported nor
# defined in this snippet — presumably transformers.TrainingArguments plus a
# dict of shared keyword arguments; confirm before running.
training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)