I am writing Python code to train a classifier that categorizes samples (10 sentences per sample). I use Sentence_Transformer with additional layers, and I run the model training on a Linux server. The code is below; the important part is the last portion, in particular where the model is fit.
import math
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import sys
import os
import csv
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from torch.utils.data import DataLoader
from collections import Counter
from LabelAccuracyEvaluator import *
from SoftmaxLoss import *
from layers import Dense, MultiHeadAttention
from sklearn.utils import resample
import torch
import random
import json
model_name = sys.argv[1] if len(sys.argv) > 1 else 'distilroberta-base'
train_batch_size = 8
model_save_path = 'Slashdot/output/gascom_hate_attention_' + model_name.replace("/", "-") # this is the line for saving the model you need for random walks
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
dense_model = Dense.Dense(in_features=3*760, out_features=6)  # applied last, to the concatenation u, v, u-v
multihead_attn = MultiHeadAttention.MultiHeadAttention(760, 5, batch_first=True)
# idea: each attention head should learn something different, which is why separate q, k, and v projections are needed
linear_proj_q = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_k = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_v = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_node = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760) #760 to 760
model = SentenceTransformer(modules=[word_embedding_model, multihead_attn, dense_model, linear_proj_q, linear_proj_k, linear_proj_v, linear_proj_node])
model_uv = SentenceTransformer(modules=[word_embedding_model, pooling_model])# w?
train_samples = []
test_samples = []
# Load and clean training dataset
trainset = pd.read_csv('Slashdot/random-walks/S_train_simil_random_walk.csv')
trainset = trainset.fillna('')
# Create a label mapping: Map each unique string label to an integer
unique_labels = trainset['label'].unique()
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
# Process train set and convert string labels to integer labels using the mapping
for i in range(len(trainset)):
    texts = []
    for j in range(1, 11):
        texts.append(trainset.iloc[i]['sent' + str(j)])
    # Convert string label to integer using the mapping
    label = label_mapping[trainset.iloc[i]['label']]
    train_samples.append(InputExample(texts=texts, label=label))
# Split into train and dev sets (80/20 split)
dev_samples = train_samples[math.ceil(0.8 * len(train_samples)):]
train_samples = train_samples[:math.ceil(0.8 * len(train_samples))]
# Load and clean test dataset
testset = pd.read_csv('Slashdot/random-walks/S_test_simil_random_walk.csv')
testset = testset.fillna('')
# Convert string labels to integer labels using the same mapping for the test set
for i in range(len(testset)):
    texts = []
    for j in range(1, 11):
        texts.append(testset.iloc[i]['sent' + str(j)])
    # Convert string label to integer using the same mapping
    label = label_mapping[testset.iloc[i]['label']]
    test_samples.append(InputExample(texts=texts, label=label))
# Count the number of samples for each numerical category (label)
train_labels = [example.label for example in train_samples]
dev_labels = [example.label for example in dev_samples]
test_labels = [example.label for example in test_samples]
# Count occurrences of each label in the train, valid, and test sets
train_label_count = Counter(train_labels)
dev_label_count = Counter(dev_labels)
test_label_count = Counter(test_labels)
# Print the counts for each label
print("Label mapping (string to integer):", label_mapping)
print("Initial Train set label distribution:", train_label_count)
print("Initial Valid set label distribution:", dev_label_count)
print("Initial Test set label distribution:", test_label_count)
print('length of train samples=', len(train_samples))
print('length of dev samples=', len(dev_samples))
print('length of test samples=', len(test_samples))
#BALANCING DATASET-------------------------------------------------BALANCING DATASET----------------------------------------------------
# Load the synonym dictionary from the JSON file
with open('Slashdot/synonym_dic.json', 'r') as f:
    synonym_dict = json.load(f)
def get_synonyms(word):
    """Get synonyms from the pre-defined dictionary."""
    return synonym_dict.get(word.lower(), [])
def replace_with_synonyms(sentence, num_replacements=2):
    """Replace words with synonyms using a hardcoded dictionary, preserving punctuation."""
    words = sentence.split()
    new_words = []
    for word in words:
        # Capture punctuation to reattach it after replacement
        prefix = ""
        suffix = ""
        # Check and remove leading punctuation
        while word and word[0] in '.,!?':
            prefix += word[0]
            word = word[1:]
        # Check and remove trailing punctuation
        while word and word[-1] in '.,!?':
            suffix += word[-1]
            word = word[:-1]
        clean_word = word  # word without punctuation
        # Skip words that don't have a good replacement
        if len(clean_word) < 4:
            new_words.append(prefix + clean_word + suffix)
            continue
        # Get synonyms using the dictionary
        synonyms = get_synonyms(clean_word)
        if synonyms:
            # Replace the word with a random synonym
            replacement = random.choice(synonyms)
            # Maintain the original case
            if clean_word[0].isupper():
                replacement = replacement.capitalize()
            new_words.append(prefix + replacement + suffix)
            # Uncomment to debug replacement
            #print(clean_word, 'replaced with', replacement)
        else:
            new_words.append(prefix + clean_word + suffix)
    return ' '.join(new_words)
def augment_sample(sample, num_augments=1):
    """Augment sample sentences using the hardcoded synonym dictionary."""
    augmented_samples = []
    for _ in range(num_augments):
        new_texts = []
        for sentence in sample.texts:
            #print('**SENTENCE:', sentence)
            new_sentence = replace_with_synonyms(sentence)
            new_texts.append(new_sentence)
            #print('**NEW SENTENCE:', new_sentence)
            #print('----------------------------------------------------------')
        augmented_samples.append(InputExample(texts=new_texts, label=sample.label))
    return augmented_samples
def oversample_to_balance(label_count, samples, dataset_name):
    # Oversample to balance classes
    print('Balancing', dataset_name, 'data:')
    max_count = max(label_count.values())
    balanced_samples = []
    for label, count in label_count.items():
        label_samples = [sample for sample in samples if sample.label == label]
        if count < max_count:
            print('balancing', label, 'from', count, 'to', max_count, '...')
            augment_count = max_count - count
            aug_samples = [augment_sample(sample)[0] for sample in resample(label_samples, n_samples=augment_count)]
            balanced_samples.extend(aug_samples)
            print('balanced')
        balanced_samples.extend(label_samples)
    return balanced_samples
# Update the samples with the balanced set
train_samples = oversample_to_balance(train_label_count,train_samples,'Train')
dev_samples = oversample_to_balance(dev_label_count,dev_samples,'Dev')
test_samples = oversample_to_balance(test_label_count,test_samples,'Test')
train_label_count = Counter([sample.label for sample in train_samples])
dev_label_count = Counter([sample.label for sample in dev_samples])
test_label_count = Counter([sample.label for sample in test_samples])
print("Balanced Train set label distribution:", train_label_count)
print("Balanced Dev set label distribution:", dev_label_count)
print("Balanced Test set label distribution:", test_label_count)
print('length of train samples=', len(train_samples))
print('length of dev samples=', len(dev_samples))
print('length of test samples=', len(test_samples))
#----------------------------------------------------------------------------------------------------------------------------------------
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
dev_dataloader = DataLoader(dev_samples, shuffle=True, batch_size=train_batch_size)
test_dataloader = DataLoader(test_samples, shuffle=True, batch_size=train_batch_size)
# Ensure that CUDA is available and get the device name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('CUDA Available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU in use:', torch.cuda.get_device_name(0))
# You can check memory usage like this:
if torch.cuda.is_available():
    print(f"Allocated GPU Memory: {torch.cuda.memory_allocated()} bytes")
    print(f"Cached GPU Memory: {torch.cuda.memory_reserved()} bytes")
#############################################GPU Check########################################################
print(f"Total training samples: {len(train_samples)}")
for i in range(1):
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
    train_loss = SoftmaxLoss(model=model, model_uv=model_uv, multihead_attn=multihead_attn, linear_proj_q=linear_proj_q,
                             linear_proj_k=linear_proj_k, linear_proj_v=linear_proj_v, linear_proj_node=linear_proj_node,
                             sentence_embedding_dimension=pooling_model.get_sentence_embedding_dimension(),
                             num_labels=6)
dev_evaluator = LabelAccuracyEvaluator(dev_dataloader, name='sts-dev', softmax_model=train_loss)
num_epochs = 3
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # use 10% of the training steps for warm-up
print('fitting...')
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,  # evaluate on the dev (validation) set every 1000 training steps
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )
test_evaluator = LabelAccuracyEvaluator(test_dataloader, name='sts-test', softmax_model=train_loss)
test_evaluator(model, output_path=model_save_path)
When I run the code, I get the following error:
fitting...
Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.
0%| | 0/19638 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/zaid/GASCOM-main/Slashdot/gascom_train.py", line 250, in <module>
model.fit(train_objectives=[(train_dataloader, train_loss)],
File "/home/zaid/.local/lib/python3.10/site-packages/sentence_transformers/fit_mixin.py", line 374, in fit
trainer.train()
File "/home/zaid/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2052, in train
return inner_training_loop(
File "/home/zaid/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/zaid/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3485, in training_step
loss = self.compute_loss(model, inputs)
File "/home/zaid/.local/lib/python3.10/site-packages/sentence_transformers/trainer.py", line 344, in compute_loss
loss = loss_fn(features, labels)
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zaid/GASCOM-main/Slashdot/SoftmaxLoss.py", line 78, in forward
reps = [self.model.module[0](sentence_feature)['token_embeddings'] for sentence_feature in sentence_features]
File "/home/zaid/GASCOM-main/Slashdot/SoftmaxLoss.py", line 78, in <listcomp>
reps = [self.model.module[0](sentence_feature)['token_embeddings'] for sentence_feature in sentence_features]
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zaid/.local/lib/python3.10/site-packages/sentence_transformers/models/Transformer.py", line 350, in forward
output_states = self.auto_model(**trans_features, **kwargs, return_dict=False)
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zaid/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 912, in forward
embedding_output = self.embeddings(
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/zaid/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 125, in forward
embeddings = inputs_embeds + token_type_embeddings
RuntimeError: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED at "../c10/cuda/driver_api.cpp":33, please report a bug to PyTorch. Can't find nvmlDeviceGetNvLinkRemoteDeviceType: /lib/x86_64-linux-gnu/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetNvLinkRemoteDeviceType
This problem is caused by a mismatch between the CUDA driver version and the PyTorch version.
TL;DR: upgrade the CUDA driver to a version >= 470.42.01, or downgrade PyTorch to a version <= 2.2.2.
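To see which side you need to change, first check which driver and PyTorch versions are actually installed. Below is a minimal sketch; it assumes a Linux machine where the nvidia-smi CLI is on the PATH:

# Report the installed NVIDIA driver and PyTorch versions.
import subprocess
import torch

driver_version = subprocess.check_output(
    ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"],
    text=True,
).strip().splitlines()[0]
print("NVIDIA driver version:", driver_version)
print("PyTorch version:", torch.__version__)
# The failing combination is a driver older than 470.42.01 together with torch >= 2.3.0;
# in that case, either upgrade the driver or run: pip install "torch<=2.2.2"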
I ran into this problem while trying to run a transformer model on an old machine where I did not have permission to upgrade the driver. I could not find any detailed explanation online, so I figured I might as well investigate it and write one up myself.
Explanation: PyTorch commit 6e1ba79b7fdf3d66db8fb69462fb502e5006e5e7 added the following lines to pytorch/c10/cuda/driver_api.h, which was released with version 2.3.0:
#define C10_NVML_DRIVER_API(_) \
_(nvmlDeviceGetNvLinkRemoteDeviceType) \
_(nvmlDeviceGetNvLinkRemotePciInfo_v2) \
However, nvmlDeviceGetNvLinkRemoteDeviceType was only added to NVML in driver 470.42.01, whereas the other required NVML functions were all added no later than 450.51. As a result, any system with a CUDA driver version in [450.51, 470.42.01) and PyTorch >= 2.3.0 will report this error whenever PyTorch tries to use NVLink.
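You can verify this directly on the affected machine by checking whether the installed libnvidia-ml.so.1 exports the symbol PyTorch looks up. A short sketch, assuming a Linux system with the NVIDIA driver library installed:

# Check whether the driver's NVML library exports nvmlDeviceGetNvLinkRemoteDeviceType.
import ctypes

nvml = ctypes.CDLL("libnvidia-ml.so.1")
try:
    # Attribute access on a CDLL performs a dlsym lookup and raises AttributeError if the symbol is absent.
    nvml.nvmlDeviceGetNvLinkRemoteDeviceType
    print("Symbol found: this driver is new enough for PyTorch >= 2.3.0")
except AttributeError:
    print("Symbol missing: PyTorch >= 2.3.0 will fail with the INTERNAL ASSERT error above")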