from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
def train_one_epoch(model, dataloader, optimizer):
    model.train()
    loss_list = []
    for batch in tqdm(dataloader):
        batch_data = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask'],
            'labels': batch['labels']
        }
        loss = model(**batch_data).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_list.append(loss.detach().item())
    avg_loss = sum(loss_list) / len(loss_list)
    print('avg loss in epoch:', avg_loss)
def evaluate(model, dataloader):
    model.eval()
    all_labels = []
    all_predictions = []
    for batch in dataloader:
        with torch.no_grad():
            batch_data = {
                'input_ids': batch['input_ids'],
                'attention_mask': batch['attention_mask']
            }
            logits = model(**batch_data).logits
            predictions = torch.argmax(logits, dim=-1)
        labels = batch['labels']
        all_labels.extend(labels)
        all_predictions.extend(predictions)
    accuracy = compute_accuracy(all_predictions, all_labels)
    print("Accuracy", accuracy)
    return accuracy
def compute_accuracy(predictions, labels):
    correct = 0
    for pred, label in zip(predictions, labels):
        if pred == label:
            correct += 1
    return correct / len(labels)
def my_collate_fn(batched_samples):
    texts = [example['text'] for example in batched_samples]
    labels = [example['label'] for example in batched_samples]
    text_encoding = tokenizer(texts, max_length=128, truncation=True, padding=True, return_tensors='pt')
    labels = torch.LongTensor(labels)
    return {
        'input_ids': text_encoding['input_ids'].cuda(),
        'attention_mask': text_encoding['attention_mask'].cuda(),
        'labels': labels.cuda()
    }
torch.manual_seed(64)
batch_size = 16
learning_rate = 5e-5
num_epochs = 10
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.cuda()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate, eps=1e-8)
datasets = load_dataset("gpt3mix/sst2")
train_dataloader = DataLoader(
    datasets['train'],
    batch_size=8,
    shuffle=True,
    collate_fn=my_collate_fn,
    num_workers=0
)
validation_dataloader = DataLoader(
    datasets['validation'],
    batch_size=8,
    shuffle=False,
    collate_fn=my_collate_fn,
    num_workers=0
)
best_acc = 0.0
for epoch in range(1, num_epochs + 1):
    train_one_epoch(model, train_dataloader, optimizer)
    valid_acc = evaluate(model, validation_dataloader)
100%|██████████| 865/865 [01:27<00:00, 9.89it/s]
avg loss in epoch: 0.6746856869559068
Accuracy 0.4908256880733945
100%|██████████| 865/865 [01:25<00:00, 10.09it/s]
avg loss in epoch: 0.6922555248516833
Accuracy 0.4908256880733945
100%|██████████| 865/865 [01:27<00:00, 9.89it/s]
avg loss in epoch: 0.6976809655310791
Accuracy 0.5091743119266054
Changing the learning rate doesn't help either.
You did not specify the number of labels for the SequenceClassification model, which can leave the model predicting the same class for every data point.
Just change the line that loads the model to:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
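As a sanity check you can also derive the label count from the dataset instead of hard-coding 2. The snippet below is only a sketch: it assumes the gpt3mix/sst2 'label' column holds integer class ids (which is what your collate_fn already relies on), and it reuses the datasets and model_name variables from your script.

# Sketch: infer num_labels from the training split rather than hard-coding it.
# Assumes the 'label' column contains integer class ids.
num_labels = len(set(datasets['train']['label']))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = model.cuda()
print(model.config.num_labels)  # should be 2 for SST-2
# For a RoBERTa-style classification head, the output projection has shape (num_labels, hidden_size):
print(model.classifier.out_proj.weight.shape)

Deriving the value from the data also avoids a silent mismatch if you later switch to a dataset with more classes.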