我正在编写一段 Python 代码,用于训练一个对样本进行分类的分类器(每个样本包含 10 个句子)。我使用 sentence-transformers 并在其上添加了附加层,在 Linux 服务器上运行模型训练。代码如下。重要的是代码的最后一部分,特别是拟合模型(`model.fit`)的那一步。

import math
import logging
from datetime import datetime
import pandas as pd
import numpy as np
import sys
import os
import csv
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from torch.utils.data import DataLoader 
from collections import Counter
from LabelAccuracyEvaluator import *
from SoftmaxLoss import *
from layers import Dense, MultiHeadAttention
from sklearn.utils import resample
import torch
import random
import json

# Backbone checkpoint: first command-line argument if given, otherwise 'distilroberta-base'.
model_name = sys.argv[1] if len(sys.argv) > 1 else 'distilroberta-base'

# Batch size shared by the train/dev/test DataLoaders created later in the script.
train_batch_size = 8

# Output path derived from the checkpoint name; any '/' in the name is replaced by '-'.
model_save_path = 'Slashdot/output/gascom_hate_attention_' + model_name.replace("/", "-") # this is the line for saving the model you need for random walks

word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
# NOTE(review): in this copy of the script the call below ends on a trailing comma and is
# never closed -- its remaining arguments appear to be missing. Restore them before running.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),

# Classification head: 3*760 input features, 6 outputs.
dense_model = Dense.Dense(in_features=3*760, out_features=6) # called last; presumably consumes the concatenation (u, v, u-v) -- TODO confirm in SoftmaxLoss
# Project-local attention layer; presumably 760 is the embedding size and 5 the number of heads -- TODO confirm in layers/MultiHeadAttention.
multihead_attn = MultiHeadAttention.MultiHeadAttention(760, 5, batch_first=True)

# idea is every attention head should be learning something new and that is why you need different q,k, and v.
# Each projection maps the backbone's word-embedding dimension to 760 features.
linear_proj_q = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_k = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_v = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760)
linear_proj_node = Dense.Dense(word_embedding_model.get_word_embedding_dimension(), 760) # NOTE(review): the old comment said "760 to 760", but the input size is the backbone embedding dimension

# `model` bundles the backbone with all custom layers; `model_uv` is backbone + pooling only.
# NOTE(review): how the individual modules are invoked is decided in SoftmaxLoss (not visible here).
model = SentenceTransformer(modules=[word_embedding_model, multihead_attn, dense_model, linear_proj_q, linear_proj_k, linear_proj_v, linear_proj_node])
model_uv = SentenceTransformer(modules=[word_embedding_model, pooling_model])# w?

def _rows_to_examples(frame, label_to_id):
    """Convert every row of *frame* into an ``InputExample``.

    Each row contributes its ten sentence columns ``sent1`` .. ``sent10`` as
    ``texts`` and its ``label`` column, mapped through *label_to_id*, as the
    integer ``label``.

    Args:
        frame: DataFrame with columns ``sent1`` .. ``sent10`` and ``label``.
        label_to_id: Mapping from raw label value to integer class id.

    Returns:
        A list with one ``InputExample`` per row, in row order.

    Raises:
        KeyError: If a row carries a label that is missing from *label_to_id*.
    """
    sentence_columns = ['sent' + str(j) for j in range(1, 11)]
    examples = []
    # iterrows() builds each row once; repeated ``frame.iloc[i][col]`` lookups
    # rebuild the row for every single cell.
    for _, row in frame.iterrows():
        texts = [row[column] for column in sentence_columns]
        examples.append(InputExample(texts=texts, label=label_to_id[row['label']]))
    return examples


# Load and clean training dataset (missing cells become empty strings)
trainset = pd.read_csv('Slashdot/random-walks/S_train_simil_random_walk.csv')
trainset = trainset.fillna('')

# Create a label mapping: map each unique label of the training file to an integer,
# in order of first appearance
unique_labels = trainset['label'].unique()
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

train_samples = _rows_to_examples(trainset, label_mapping)

# Split into train and dev sets (80/20 split). The split is positional: the first
# 80% of the rows stay in train, the remaining rows become dev; nothing is shuffled here.
split_index = math.ceil(0.8 * len(train_samples))
dev_samples = train_samples[split_index:]
train_samples = train_samples[:split_index]

# Load and clean test dataset
testset = pd.read_csv('Slashdot/random-walks/S_test_simil_random_walk.csv')
testset = testset.fillna('')

# The test set reuses the mapping built from the training file, so a label that
# only occurs in the test file raises KeyError.
test_samples = _rows_to_examples(testset, label_mapping)

# Collect the integer label of every sample in each split
train_labels = [example.label for example in train_samples]
dev_labels = [example.label for example in dev_samples]
test_labels = [example.label for example in test_samples]

# Count occurrences of each label in the train, valid, and test sets
train_label_count = Counter(train_labels)
dev_label_count = Counter(dev_labels)
test_label_count = Counter(test_labels)

# Print the counts for each label
print("Label mapping (string to integer):", label_mapping)
print("Initial Train set label distribution:", train_label_count)
print("Initial Valid set label distribution:", dev_label_count)
print("Initial Test set label distribution:", test_label_count)

print('length of train samples=', len(train_samples))
print('length of dev samples=', len(dev_samples))
print('length of test samples=', len(test_samples))

#BALANCING DATASET-------------------------------------------------BALANCING DATASET----------------------------------------------------
# Load the synonym dictionary from the JSON file.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# locale of the machine the script runs on.
with open('Slashdot/synonym_dic.json', 'r', encoding='utf-8') as f:
    synonym_dict = json.load(f)

def get_synonyms(word):
    """Look up *word* case-insensitively in the module-level synonym dictionary.

    Returns the entry stored under the lower-cased word, or a new empty list
    when the dictionary has no such key.
    """
    key = word.lower()
    if key in synonym_dict:
        return synonym_dict[key]
    return []

def replace_with_synonyms(sentence, num_replacements=2):
    """Return *sentence* with eligible words swapped for random synonyms.

    The sentence is split on whitespace. For every token, leading and trailing
    punctuation (``. , ! ?``) is set aside, the remaining word is looked up via
    ``get_synonyms`` and, if synonyms exist, replaced by a randomly chosen one;
    the punctuation is then re-attached in its original order. Every input
    token yields exactly one output token.

    Args:
        sentence: Text to augment.
        num_replacements: Accepted for backward compatibility; it is not
            used -- every eligible word is replaced.

    Returns:
        The rebuilt sentence with tokens joined by single spaces (runs of
        whitespace in the input are therefore collapsed).
    """
    punctuation = '.,!?'
    new_words = []

    for word in sentence.split():
        # Separate punctuation from both ends by slicing so that its order is
        # kept intact ("what?!" must not come back as "what!?").
        core = word.lstrip(punctuation)
        prefix = word[:len(word) - len(core)]
        clean_word = core.rstrip(punctuation)
        suffix = core[len(clean_word):]

        # Words shorter than four characters are kept unchanged and never
        # looked up.
        if len(clean_word) < 4:
            new_words.append(prefix + clean_word + suffix)
            continue

        synonyms = get_synonyms(clean_word)
        if not synonyms:
            # No synonym known: keep the original word instead of dropping it.
            new_words.append(prefix + clean_word + suffix)
            continue

        replacement = random.choice(synonyms)
        # Carry over a leading capital from the original word.
        if clean_word[0].isupper():
            replacement = replacement.capitalize()
        new_words.append(prefix + replacement + suffix)

    return ' '.join(new_words)

def augment_sample(sample, num_augments=1):
    """Create synonym-augmented copies of *sample*.

    Args:
        sample: Object with a ``texts`` sequence of sentences and a ``label``.
        num_augments: Number of augmented copies to produce.

    Returns:
        A list of ``num_augments`` new ``InputExample`` objects. Each one
        holds every sentence of ``sample.texts`` passed through
        ``replace_with_synonyms`` (same count and order) and the original
        label. The input sample is not modified.
    """
    return [
        InputExample(
            # The rewritten sentences must actually be collected here;
            # otherwise the augmented example would carry no text at all.
            texts=[replace_with_synonyms(sentence) for sentence in sample.texts],
            label=sample.label,
        )
        for _ in range(num_augments)
    ]

def oversample_to_balance(label_count,samples,dataset_name):
    """Balance the classes of *samples* by adding augmented copies.

    Every class is brought up to the size of the largest class: its original
    samples are kept, and the shortfall is filled with augmented versions of
    samples drawn from that class via ``resample``.

    Args:
        label_count: Mapping from label to the number of samples with that
            label (e.g. a ``Counter``).
        samples: Sequence of objects exposing a ``label`` attribute.
        dataset_name: Name of the split; accepted for interface compatibility
            and not used.

    Returns:
        A new list, grouped by label in the iteration order of *label_count*,
        holding all original samples followed (per label) by the augmented
        ones. When *label_count* is empty, a plain copy of *samples*.
    """
    if not label_count:
        # Nothing to balance against; max() below would raise on an empty mapping.
        return list(samples)

    max_count = max(label_count.values())
    balanced_samples = []
    for label, count in label_count.items():
        label_samples = [sample for sample in samples if sample.label == label]
        # Keep the originals -- the result must contain them, not only the
        # augmented extras.
        balanced_samples.extend(label_samples)
        if count < max_count:
            augment_count = max_count - count
            aug_samples = [augment_sample(sample)[0] for sample in resample(label_samples, n_samples=augment_count)]
            balanced_samples.extend(aug_samples)
    return balanced_samples

# Rebalance each split in turn (train, then dev, then test) and keep the result
train_samples = oversample_to_balance(train_label_count, train_samples, 'Train')
dev_samples = oversample_to_balance(dev_label_count, dev_samples, 'Dev')
test_samples = oversample_to_balance(test_label_count, test_samples, 'Test')

# Recount the labels now that the splits have been rebuilt
train_label_count = Counter(example.label for example in train_samples)
dev_label_count = Counter(example.label for example in dev_samples)
test_label_count = Counter(example.label for example in test_samples)

# Report the per-label distribution of every split
for split_title, split_counts in (
    ("Train", train_label_count),
    ("Dev", dev_label_count),
    ("Test", test_label_count),
):
    print(f"Balanced {split_title} set label distribution:", split_counts)

# Report the size of every split
for split_key, split_samples in (
    ('train', train_samples),
    ('dev', dev_samples),
    ('test', test_samples),
):
    print(f'length of {split_key} samples=', len(split_samples))

# One shuffled DataLoader per split, all using the same batch size
loader_options = dict(shuffle=True, batch_size=train_batch_size)
train_dataloader = DataLoader(train_samples, **loader_options)
dev_dataloader = DataLoader(dev_samples, **loader_options)
test_dataloader = DataLoader(test_samples, **loader_options)

# Select the GPU when CUDA is usable, otherwise fall back to the CPU
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")
print('CUDA Available:', cuda_ready)

if cuda_ready:
    print('GPU in use:', torch.cuda.get_device_name(0))
    # Memory figures reported by torch.cuda at this point, in bytes
    print(f"Allocated GPU Memory: {torch.cuda.memory_allocated()} bytes")
    print(f"Cached GPU Memory: {torch.cuda.memory_reserved()} bytes")

#############################################GPU Check########################################################

# Report how many (balanced) training samples go into training.
print(f"Total training samples: {len(train_samples)}")

# Single training run (the loop body executes exactly once).
for i in range(1):
    # Rebuilds the shuffled training DataLoader, replacing the one created earlier in the script.
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
    # Project-local loss that receives the model and every custom layer.
    # NOTE(review): in this copy of the script the call below ends on a trailing comma and is
    # never closed -- its remaining arguments appear to be missing. Restore them before running.
    train_loss = SoftmaxLoss(model=model, model_uv=model_uv, multihead_attn=multihead_attn, linear_proj_q=linear_proj_q,
        linear_proj_k=linear_proj_k, linear_proj_v=linear_proj_v, linear_proj_node=linear_proj_node,

    # Evaluator over the dev DataLoader; it is handed the loss object as its softmax model.
    dev_evaluator = LabelAccuracyEvaluator(dev_dataloader, name='sts-dev', softmax_model=train_loss)

    num_epochs = 3
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of all training batches (batches per epoch * epochs), rounded up

    # Train the model
    # NOTE(review): this call is also cut off after `evaluation_steps` in this copy; the
    # remaining keyword arguments are missing.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
        evaluation_steps=1000, # NOTE(review): presumably counted in training steps (batches), not examples -- confirm against the sentence-transformers docs

# Final evaluation on the test DataLoader, reusing the loss object from the training run above.
test_evaluator = LabelAccuracyEvaluator(test_dataloader, name='sts-test', softmax_model=train_loss)
test_evaluator(model, output_path=model_save_path)


Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.
  0%|                                                                                                                             | 0/19638 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/home/zaid/GASCOM-main/Slashdot/gascom_train.py", line 250, in <module>
    model.fit(train_objectives=[(train_dataloader, train_loss)],
  File "/home/zaid/.local/lib/python3.10/site-packages/sentence_transformers/fit_mixin.py", line 374, in fit
  File "/home/zaid/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2052, in train
    return inner_training_loop(
  File "/home/zaid/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2388, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/zaid/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3485, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/zaid/.local/lib/python3.10/site-packages/sentence_transformers/trainer.py", line 344, in compute_loss
    loss = loss_fn(features, labels)
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/zaid/GASCOM-main/Slashdot/SoftmaxLoss.py", line 78, in forward
    reps = [self.model.module[0](sentence_feature)['token_embeddings'] for sentence_feature in sentence_features]
  File "/home/zaid/GASCOM-main/Slashdot/SoftmaxLoss.py", line 78, in <listcomp>
    reps = [self.model.module[0](sentence_feature)['token_embeddings'] for sentence_feature in sentence_features]
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/zaid/.local/lib/python3.10/site-packages/sentence_transformers/models/Transformer.py", line 350, in forward
    output_states = self.auto_model(**trans_features, **kwargs, return_dict=False)
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/zaid/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 912, in forward
    embedding_output = self.embeddings(
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/zaid/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/zaid/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 125, in forward
    embeddings = inputs_embeds + token_type_embeddings
RuntimeError: r.nvmlDeviceGetNvLinkRemoteDeviceType_ INTERNAL ASSERT FAILED at "../c10/cuda/driver_api.cpp":33, please report a bug to PyTorch. Can't find nvmlDeviceGetNvLinkRemoteDeviceType: /lib/x86_64-linux-gnu/libnvidia-ml.so.1: undefined symbol: nvmlDeviceGetNvLinkRemoteDeviceType
TL;DR:将 NVIDIA(CUDA)驱动程序升级到 470.42.01 或更高版本,或者将 PyTorch 降级到 2.2.2 或更低版本。


说明:pytorch commit 6e1ba79b7fdf3d66db8fb69462fb502e5006e5e7将以下几行添加到随版本2.3.0发布的pytorch/c10/cuda/driver_api.h中:

#define C10_NVML_DRIVER_API(_)           \
  _(nvmlDeviceGetNvLinkRemoteDeviceType) \
  _(nvmlDeviceGetNvLinkRemotePciInfo_v2) \

但是,`nvmlDeviceGetNvLinkRemoteDeviceType` 是在驱动版本 470.42.01 中才被添加到 NVML 的,而其他所需的 NVML 函数最晚在 450.51 就已加入。因此,任何 CUDA 驱动程序版本处于 [450.51, 470.42.01) 区间且 PyTorch 版本 >= 2.3.0 的系统,在 PyTorch 尝试使用 NVLink 时都会报告这个错误。

