TORCH_USE_CUDA_DSA to enable device-side assertions

Problem description

I am trying to fine-tune a Llama 3 model with a 1L dataset, and during training I run into the error below; to be precise, I get this error after completing 30k steps. I am training on 2 GPUs, each with 24 GB of memory.

Traceback (most recent call last):
  File "/home/llm04/Ejyle_Sutherland_NLP/finetune/finetune_with_peft.py", line 152, in <module>
    trainer.train()
  File "/home/llm04/new_venv/lib/python3.10/site-packages/trl/trainer/sft_trainer.py", line 440, in train
    output = super().train(*args, **kwargs)
  File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 3241, in training_step
    torch.cuda.empty_cache()
  File "/home/llm04/new_venv/lib/python3.10/site-packages/torch/cuda/memory.py", line 162, in empty_cache
    torch._C._cuda_emptyCache()
RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
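
Following the hints in the error message, a minimal debugging sketch (my assumption of how the hints are applied: the environment variables must be set before torch is imported, and TORCH_USE_CUDA_DSA only takes effect in a PyTorch build compiled with device-side assertions):

import os

# Make kernel launches synchronous so the CUDA error surfaces at the real call site.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Only has an effect if this PyTorch build was compiled with device-side assertions.
os.environ["TORCH_USE_CUDA_DSA"] = "1"

import torch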

Here is the code I am using for fine-tuning:

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import os
import torch
from torch.utils.data import DataLoader
import wandb
import pandas as pd
from datasets import Dataset, load_dataset
from trl import SFTTrainer, setup_chat_format
from huggingface_hub import login

torch.cuda.empty_cache()

# # Insert your Hugging Face token here
hf_token = "token"

# # Login to Hugging Face Hub
login(token=hf_token)

# # Define paths and model parameters
# base_model = "meta-llama/Meta-Llama-3-8B"
base_model = "/home/llm04/Meta-Llama-3-8B-Instruct"
dataset_path = "/home/llm04/Ejyle_Sutherland_NLP/Datasets/R918.xlsx"
new_model = "R918-Test-10"
torch_dtype = torch.bfloat16
attn_implementation = "eager"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)
model = get_peft_model(model, peft_config)
# Load and preprocess the dataset
df = pd.read_excel(dataset_path)
dataset = Dataset.from_pandas(df)
#dataset=load_dataset('csv',data_files=dataset_path)

system_prompt = (
    """you are a medical coding expert, who have access to medical coding guidelines. extract all the medical conditions from all the sections including clinical information/condition and its associated ICD-10 descriptions(description only without code) with anatomical locations for the given clinical text of all the sections, exclude negated conditions from the given radiology record and return only the ICD-10 descriptions(description only without code) of non-negated conditions in this json format{"description":[]}, except json do not send anything else."""
)

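# Separate tokenizer loaded only for its chat template, used below to format each dataset row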
template_tokenizer= AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

def format_chat_template(row):
    user_content = row["Sentence"] if row["Sentence"] is not None else ""
    assistant_content = row["Description"] if row["Description"] is not None else ""
    messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content}]
    #print(row_json,"------------------------------------------")
    final = template_tokenizer.apply_chat_template(messages, tokenize=False)
    #print(final,"----------")   
    return {'text': final}

#print(format_chat_template(dataset[0]))
print(f"original dataset: {len(dataset)}")
dataset = dataset.map(format_chat_template, remove_columns=dataset.column_names)
print(dataset[0],"---------------------------------map-------------------------------")
print(len(dataset))
dataset = dataset.shuffle(seed=66).select(range(563))

print(f"shuffled dataset: {len(dataset)}")

# Split the dataset into a training and validation set
dataset = dataset.train_test_split(test_size=0.00539)
print(dataset['train'][0],"----------------split-------------------")
# Training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    eval_strategy="epoch",
    eval_steps=100,  # Change to 10 to avoid too frequent evaluation
    logging_steps=50,
    warmup_steps=100,
    logging_strategy="epoch",
    learning_rate=2e-5,
    fp16=False,
    bf16=True,
    group_by_length=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_total_limit=3,
    save_strategy="epoch",
    #max_steps=(len(dataset)//per_device_train_batch_size)*num_train_epochs
    # report_to="wandb"
)
device = torch.device("cuda:0")
peft_model= model.to(device)
# Supervised fine-tuning (SFT) trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=1024,
    packing=False,
    )

trainer.train()


model.config.use_cache = True
trainer.model.save_pretrained(new_model)
# trainer.model.push_to_hub(new_model, use_temp_dir=False)

Note: I am running this code on a server with 2 GPUs. I have tried restarting the server and creating a new virtual environment, but nothing has worked.

Thanks in advance.

python cuda torch fine-tuning
1 Answer

Have you tried changing device_map="auto" to device_map="cuda" in model = AutoModelForCausalLM.from_pretrained()?
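
A minimal sketch of that change against the loading code from the question (untested assumption; device_map="cuda" places the whole model on one GPU instead of letting Accelerate shard it across both cards):

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="cuda",                 # was device_map="auto"
    attn_implementation=attn_implementation,
)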
