I am trying to fine-tune a Llama 3 model with a 1L dataset. Partway through training I hit the error below; to be precise, it appears after completing 30k steps. I am training on 2 GPUs, each with 24 GB of memory.
Traceback (most recent call last):
File "/home/llm04/Ejyle_Sutherland_NLP/finetune/finetune_with_peft.py", line 152, in <module>
trainer.train()
File "/home/llm04/new_venv/lib/python3.10/site-packages/trl/trainer/sft_trainer.py", line 440, in train
output = super().train(*args, **kwargs)
File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 1885, in train
return inner_training_loop(
File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 3241, in training_step
torch.cuda.empty_cache()
File "/home/llm04/new_venv/lib/python3.10/site-packages/torch/cuda/memory.py", line 162, in empty_cache
torch._C._cuda_emptyCache()
RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
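As the message itself notes, CUDA errors are reported asynchronously, so the frame shown above (torch.cuda.empty_cache()) may not be where the failure actually happened. A minimal sketch of the debugging step the message suggests, assuming the environment variable is set from inside the script rather than exported in the shell, is to set CUDA_LAUNCH_BLOCKING before CUDA is initialized (this slows training noticeably and is only for diagnosis):

# Optional debugging step: make kernel launches synchronous so the stack
# trace points at the call that actually failed.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before CUDA is initialized

import torch  # import torch only after setting the variable

The same effect can be had by launching the script with CUDA_LAUNCH_BLOCKING=1 set in the shell.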
Here is the code I am using for fine-tuning:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import os
import torch
from torch.utils.data import DataLoader
import wandb
import pandas as pd
from datasets import Dataset, load_dataset
from trl import SFTTrainer, setup_chat_format
from huggingface_hub import login
torch.cuda.empty_cache()
# Insert your Hugging Face token here
hf_token = "token"
# Login to Hugging Face Hub
login(token=hf_token)
# Define paths and model parameters
# base_model = "meta-llama/Meta-Llama-3-8B"
base_model = "/home/llm04/Meta-Llama-3-8B-Instruct"
dataset_path = "/home/llm04/Ejyle_Sutherland_NLP/Datasets/R918.xlsx"
new_model = "R918-Test-10"
torch_dtype = torch.bfloat16
attn_implementation = "eager"
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)
model = get_peft_model(model, peft_config)
# Load and preprocess the dataset
df = pd.read_excel(dataset_path)
dataset = Dataset.from_pandas(df)
#dataset=load_dataset('csv',data_files=dataset_path)
system_prompt = (
"""you are a medical coding expert, who have access to medical coding guidelines. extract all the medical conditions from all the sections including clinical information/condition and its associated ICD-10 descriptions(description only without code) with anatomical locations for the given clinical text of all the sections, exclude negated conditions from the given radiology record and return only the ICD-10 descriptions(description only without code) of non-negated conditions in this json format{"description":[]}, except json do not send anything else."""
)
template_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
def format_chat_template(row):
    user_content = row["Sentence"] if row["Sentence"] is not None else ""
    assistant_content = row["Description"] if row["Description"] is not None else ""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content},
    ]
    #print(row_json,"------------------------------------------")
    final = template_tokenizer.apply_chat_template(messages, tokenize=False)
    #print(final,"----------")
    return {'text': final}
#print(format_chat_template(dataset[0]))
print(f"original dataset: {len(dataset)}")
dataset = dataset.map(format_chat_template, remove_columns=dataset.column_names)
print(dataset[0],"---------------------------------map-------------------------------")
print(len(dataset))
dataset = dataset.shuffle(seed=66).select(range(563))
print(f"shuffled dataset: {len(dataset)}")
# Split the dataset into a training and validation set
dataset = dataset.train_test_split(test_size=0.00539)
print(dataset['train'][0],"----------------split-------------------")
# Training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    eval_strategy="epoch",
    eval_steps=100,  # Change to 10 to avoid too frequent evaluation
    logging_steps=50,
    warmup_steps=100,
    logging_strategy="epoch",
    learning_rate=2e-5,
    fp16=False,
    bf16=True,
    group_by_length=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_total_limit=3,
    save_strategy="epoch",
    #max_steps=(len(dataset)//per_device_train_batch_size)*num_train_epochs
    # report_to="wandb"
)
device = torch.device("cuda:0")
peft_model = model.to(device)
# Supervised fine-tuning (SFT) trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=1024,
    packing=False,
)
trainer.train()
model.config.use_cache = True
trainer.model.save_pretrained(new_model)
# trainer.model.push_to_hub(new_model, use_temp_dir=False)
Note: I am running this code on a server with 2 GPUs. I have tried restarting the server and creating a new virtual environment and running it again, but nothing worked.
Thanks in advance.
Have you tried changing device_map="auto" to device_map="cuda" in model = AutoModelForCausalLM.from_pretrained()?
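For reference, a minimal sketch of that change (untested) would look like this; device_map={"": 0} is another common way to pin the whole model to a single GPU instead of letting accelerate shard it across both cards:

# Untested sketch of the suggested change: load the whole model on one GPU
# instead of letting accelerate shard it across both devices.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="cuda",  # or device_map={"": 0} to pin everything to GPU 0
    attn_implementation=attn_implementation,
)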