I have the following code, which is meant to identify the most influential words behind the correct predictions on the texts in my test dataset:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from captum.attr import IntegratedGradients
# Loading data
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def preprocess_data(df, tokenizer, max_len=128):
    inputs = tokenizer(list(df['text']), padding=True, truncation=True, max_length=max_len, return_tensors="pt")
    labels = torch.tensor(df['label'].values)
    return inputs, labels
train_inputs, train_labels = preprocess_data(train_df, tokenizer)
test_inputs, test_labels = preprocess_data(test_df, tokenizer)
# DataLoader
train_dataset = torch.utils.data.TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
test_loader = DataLoader(test_dataset, batch_size=16)
# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
# Training Loop
model.train()
for epoch in range(3):  # Train for 3 epochs
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} loss: {loss.item()}")
# Evaluation
model.eval()
correct_predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        correct_predictions.extend(
            (preds == labels).cpu().numpy().tolist()
        )
accuracy = accuracy_score(test_labels.numpy(), correct_predictions)
print(f"Test Accuracy: {accuracy:.2f}")
# Integrated Gradients
ig = IntegratedGradients(model)
def get_influential_words(input_text, model, tokenizer, ig, device):
    model.eval()
    # Tokenizing the input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    input_ids = inputs['input_ids'].to(device, dtype=torch.long)  # Explicitly convert to LongTensor
    attention_mask = inputs['attention_mask'].to(device, dtype=torch.long)  # Explicitly convert to LongTensor
    print("Input IDs shape:", input_ids.shape, "dtype:", input_ids.dtype)
    print("Attention mask shape:", attention_mask.shape, "dtype:", attention_mask.dtype)

    # forward function for IG
    def forward_func(input_ids):
        outputs = model(input_ids, attention_mask=attention_mask)
        return outputs.logits

    # Applying Integrated Gradients
    attributions, delta = ig.attribute(input_ids, target=1, return_convergence_delta=True)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    token_importances = attributions.sum(dim=2).squeeze(0).detach().cpu().numpy()
    return list(zip(tokens, token_importances))
# Analysing influential words for correctly predicted texts
for idx, correct in enumerate(correct_predictions):
    if correct:
        influential_words = get_influential_words(test_df['text'].iloc[idx], model, tokenizer, ig, device)
        print(f"Influential words for text: {test_df['text'].iloc[idx]}")
        print(influential_words)
However, when I run the code above I get the following error:
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/usr/local/lib/python3.10/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
Epoch 1 loss: 0.4719192385673523
Epoch 2 loss: 0.39585667848587036
Epoch 3 loss: 0.14659778773784637
Test Accuracy: 0.70
Input IDs shape: torch.Size([1, 8]) dtype: torch.int64
Attention mask shape: torch.Size([1, 8]) dtype: torch.int64
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-9-f047b509c98d> in <cell line: 90>()
90 for idx, correct in enumerate(correct_predictions):
91 if correct:
---> 92 influential_words = get_influential_words(test_df['text'].iloc[idx], model, tokenizer, ig, device)
93 print(f"Influential words for text: {test_df['text'].iloc[idx]}")
94 print(influential_words)
18 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2549 # remove once script supports set_grad_enabled
2550 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2551 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2552
2553
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)
You need to change the attribution class slightly. Also, you never passed your forward_func into the attribution class's constructor, so the attribute method cannot set things up correctly.
I think LayerIntegratedGradients is a better fit for interpreting BERT here, in line with this tutorial: https://captum.ai/tutorials/Bert_SQUAD_Interpret
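Roughly what is going on with your current call (my reading of the traceback, sketched with made-up token IDs): plain IntegratedGradients interpolates between a baseline and the input itself, so your integer input_ids become float tensors before they ever reach BERT's embedding lookup, which is exactly the RuntimeError you see. LayerIntegratedGradients sidesteps this by attributing at the embedding layer instead.

import torch

# Illustration only: IG evaluates the model on inputs interpolated between a
# baseline and the real input, so integer token IDs turn into floats.
input_ids = torch.tensor([[101, 2023, 2003, 102]], dtype=torch.long)
baseline = torch.zeros_like(input_ids)
alpha = 0.5  # one of the interpolation steps
interpolated = baseline + alpha * (input_ids - baseline)
print(interpolated.dtype)  # torch.float32 -- not valid as embedding indices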
Please find a working snippet below:
from captum.attr import LayerIntegratedGradients
def custom_forward(input_ids, attention_mask):
    # Probability of the positive class (index 1), one value per example,
    # so Captum gets a score for every interpolated input in the batch
    logits = model(input_ids, attention_mask=attention_mask).logits
    return torch.softmax(logits, dim=1)[:, 1]

lig = LayerIntegratedGradients(custom_forward, model.bert.embeddings)
def get_influential_words(input_text, model, tokenizer, lig, device):
    model.eval()
    # Tokenizing the input text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    # Attribute at the embedding layer; the attention mask is passed through to custom_forward
    attributions, delta = lig.attribute(
        input_ids,
        additional_forward_args=(attention_mask,),
        return_convergence_delta=True,
    )
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    # Collapse the hidden dimension to get one importance score per token
    token_importances = attributions.sum(dim=2).squeeze(0).detach().cpu().numpy()
    return list(zip(tokens, token_importances))
results = []
for idx, correct in enumerate(correct_predictions):
    if correct:
        influential_words = get_influential_words(test_df['text'].iloc[idx], model, tokenizer, lig, device)
        results.append((test_df['text'].iloc[idx], influential_words))
        print(f"Influential words for text: {test_df['text'].iloc[idx]}")
        print(influential_words)
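If what you ultimately want is the most influential words per text rather than raw wordpiece scores, you can merge the '##' subword pieces back into words and sort by importance. Something like this (the helper name, the summing of subword scores, and k=5 are just my own choices, not part of the snippet above):

def top_influential_words(token_scores, k=5):
    """Merge BERT wordpieces back into words and return the k highest-scoring ones."""
    words, scores = [], []
    for token, score in token_scores:
        if token in ("[CLS]", "[SEP]", "[PAD]"):
            continue  # skip special tokens
        if token.startswith("##") and words:
            words[-1] += token[2:]      # glue the subword onto the previous word
            scores[-1] += float(score)  # accumulate its importance there
        else:
            words.append(token)
            scores.append(float(score))
    return sorted(zip(words, scores), key=lambda p: p[1], reverse=True)[:k]

# e.g. print(top_influential_words(influential_words, k=5))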