IndexError: list index out of range when trying to predict from a fine-tuned model with Hugging Face


I am trying to learn how to fine-tune a pretrained model and then use it. Here is my code:

from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch

# Define a simple accuracy metric
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == labels).mean()}

# Load the dataset
dataset = load_dataset("imdb", split='train[:1%]')
split_datasets = dataset.train_test_split(test_size=0.1)  # split once so train and eval don't overlap
small_train_dataset = split_datasets['train']
small_eval_dataset = split_datasets['test']

# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

small_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
small_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)
small_train_dataset = small_train_dataset.rename_column("label", "labels")
small_eval_dataset = small_eval_dataset.rename_column("label", "labels")
small_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
small_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Define training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
validation_results = trainer.evaluate()
print(validation_results)

Now I am trying to make a prediction with the fine-tuned model, like this:

inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
predictions = trainer.predict(test_dataset=inputs)

When I try to make the prediction, I get this error:

IndexError                                Traceback (most recent call last)
Cell In[8], line 7
      3 inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
      6 # Make a prediction
----> 7 predictions = trainer.predict(test_dataset=inputs)

File C:\Python311\Lib\site-packages\transformers\trainer.py:3305, in Trainer.predict(self, test_dataset, ignore_keys, metric_key_prefix)
   3304 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3305 output = eval_loop(
   3306     test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
   3307 )
   3308 total_batch_size = self.args.eval_batch_size * self.args.world_size
   3309 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:

File C:\Python311\Lib\site-packages\transformers\trainer.py:3408, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   3406 observed_num_examples = 0
   3407 # Main evaluation loop
-> 3408 for step, inputs in enumerate(dataloader):
   3409     # Update the observed num examples
   3410     observed_batch_size = find_batch_size(inputs)
   3411     if observed_batch_size is not None:

File C:\Python311\Lib\site-packages\accelerate\data_loader.py:454, in DataLoaderShard.__iter__(self)
    452 # We iterate one batch ahead to check when we are at the end
    453 try:
--> 454     current_batch = next(dataloader_iter)
    455 except StopIteration:
    456     yield

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:631, in _BaseDataLoaderIter.__next__(self)
    628 if self._sampler_iter is None:
    629     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    630     self._reset()  # type: ignore[call-arg]
--> 631 data = self._next_data()
    632 self._num_yielded += 1
    633 if self._dataset_kind == _DatasetKind.Iterable and \
    634         self._IterableDataset_len_called is not None and \
    635         self._num_yielded > self._IterableDataset_len_called:

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:675, in _SingleProcessDataLoaderIter._next_data(self)
    673 def _next_data(self):
    674     index = self._next_index()  # may raise StopIteration
--> 675     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    676     if self._pin_memory:
    677         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in <listcomp>(.0)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File C:\Python311\Lib\site-packages\transformers\tokenization_utils_base.py:255, in BatchEncoding.__getitem__(self, item)
    253     return self.data[item]
    254 elif self._encodings is not None:
--> 255     return self._encodings[item]
    256 elif isinstance(item, slice):
    257     return {key: self.data[key][item] for key in self.data.keys()}

IndexError: list index out of range

nlp huggingface-transformers huggingface fine-tuning
1 Answer

The error occurs because the trainer.predict method expects a dataset as input, but you are passing a single example that has already been tokenized into tensors (a BatchEncoding).
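You can see the mismatch directly: Trainer treats whatever it receives as a map-style dataset, but indexing a BatchEncoding with an integer returns per-example encodings, and there is only one example here. A quick illustration using the inputs object from the question (the exact key count depends on the tokenizer):

# A BatchEncoding is dict-like, so len() counts its keys
# (input_ids, token_type_ids, attention_mask), not examples.
print(len(inputs))  # 3 for bert-base-uncased, so the sampler asks for indices 0, 1, 2
print(inputs[0])    # fine: the Encoding of the single example
print(inputs[1])    # IndexError: list index out of range -- only one example exists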

To make a prediction on a single input, you need to prepare it the same way the dataset was prepared before training, and then use the model directly for the prediction.

Here is how to modify the code to predict on a single input:

  1. Prepare the input correctly
  2. Use the model directly to make the prediction

Here is the modified code:


from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch

# Define a simple accuracy metric
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == labels).mean()}

# Load the dataset
dataset = load_dataset("imdb", split='train[:1%]')
split_datasets = dataset.train_test_split(test_size=0.1)  # split once so train and eval don't overlap
small_train_dataset = split_datasets['train']
small_eval_dataset = split_datasets['test']

# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

small_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
small_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)
small_train_dataset = small_train_dataset.rename_column("label", "labels")
small_eval_dataset = small_eval_dataset.rename_column("label", "labels")
small_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
small_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Define training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
validation_results = trainer.evaluate()
print(validation_results)

# Make a prediction on a single input
inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # after trainer.train() the model may be on GPU; keep devices matched
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

print(f"Predicted label: {predictions.item()}")