我们学校有一台带有多个 GPU 的 HPC,我正在尝试找出如何利用更多 GPU 来更快地运行我的代码,但是我在批量大小 128 时不断遇到此错误:
Traceback (most recent call last):
File "/home/rgg2706/Multimodal-Sentiment-Analysis/Models/HHMAFM/src/instructor_tests/train2_test.py", line 260, in <module>
ins.run()
File "/home/rgg2706/Multimodal-Sentiment-Analysis/Models/HHMAFM/src/instructor_tests/train2_test.py", line 114, in run
roberta_text_features = self.roberta(**roberta_inputs_text).last_hidden_state[:, 0, :]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/transformers/models/roberta/modeling_roberta.py", line 835, in forward
encoder_outputs = self.encoder(
^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/transformers/models/roberta/modeling_roberta.py", line 524, in forward
layer_outputs = layer_module(
^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/transformers/models/roberta/modeling_roberta.py", line 413, in forward
self_attention_outputs = self.attention(
^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/transformers/models/roberta/modeling_roberta.py", line 340, in forward
self_outputs = self.self(
^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/.autofs/tools/spack/var/spack/environments/default-nlp-x86_64-24072401/.spack-env/view/lib/python3.11/site-packages/transformers/models/roberta/modeling_roberta.py", line 236, in forward
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacty of 39.38 GiB of which 9.38 MiB is free. Including non-PyTorch memory, this process has 39.36 GiB memory in use. Of the allocated memory 38.41 GiB is allocated by PyTorch, and 460.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
一些重要信息:
我训练模型的数据集有 19,600 个文本+图像样本
我正在使用 pytorch 通过网络从预先训练的模型中提供这些特征
我已经使用 1 个 GPU 运行脚本,但需要大约 10 多个小时
批量大小为 64 时,GPU 内存约为 32/40GB
这是我正在使用的讲师课程:
class Instructor:
def __init__(self, opt):
self.opt = opt
self.train_losses = []
print('> training arguments:')
for arg in vars(opt):
print(f'>>> {arg}: {getattr(opt, arg)}')
transform = transforms.Compose([
transforms.RandomCrop(opt.crop_size),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
mvsa_dataset = MVSADatasetReader(transform, dataset=opt.dataset, max_seq_len=opt.max_seq_len, path_image=opt.path_image)
opt.num_classes = mvsa_dataset.num_classes
self.train_data_loader = DataLoader(dataset=mvsa_dataset.train_data, batch_size=opt.batch_size, shuffle=True)
self.dev_data_loader = DataLoader(dataset=mvsa_dataset.dev_data, batch_size=opt.batch_size, shuffle=False)
self.test_data_loader = DataLoader(dataset=mvsa_dataset.test_data, batch_size=opt.batch_size, shuffle=False)
print('building model')
self.roberta = RobertaModel.from_pretrained('roberta-base').to(device)
self.resnet = models.resnet152(pretrained=True).to(device)
self.densenet = models.densenet121(pretrained=True).to(device)
self.model = opt.model_class(opt).to(device)
# Use multiple GPUs if available
if torch.cuda.device_count() > 1:
self.model = nn.DataParallel(self.model)
self.roberta = nn.DataParallel(self.roberta)
self.resnet = nn.DataParallel(self.resnet)
self.densenet = nn.DataParallel(self.densenet)
self.reset_parameters()
这是运行函数:
def run(self):
criterion = nn.CrossEntropyLoss()
params = filter(lambda p: p.requires_grad, self.model.parameters())
optimizer = self.opt.optimizer(params, lr=self.opt.learning_rate)
max_dev_f1, max_test_f1 = 0, 0
for epoch in range(self.opt.num_epoch):
print('>' * 100)
print(f'epoch: {epoch}')
epoch_start_time = time.time()
self.model.train()
for i_batch, sample_batched in enumerate(self.train_data_loader):
batch_start_time = time.time()
optimizer.zero_grad()
input_ids_text = sample_batched['input_ids_text'].to(device)
attention_mask_text = sample_batched['attention_mask_text'].to(device)
input_ids_topic = sample_batched['input_ids_topic'].to(device)
attention_mask_topic = sample_batched['attention_mask_topic'].to(device)
images = sample_batched['image'].to(device)
targets = sample_batched['polarity'].to(device)
print(f"input_ids_text device: {input_ids_text.device}")
resnet_features = self.resnet(images)
densenet_features = self.densenet(images)
roberta_inputs_text = {
'input_ids': input_ids_text,
'attention_mask': attention_mask_text
}
roberta_text_features = self.roberta(**roberta_inputs_text).last_hidden_state[:, 0, :]
roberta_inputs_topic = {
'input_ids': input_ids_topic,
'attention_mask': attention_mask_topic
}
roberta_topic_features = self.roberta(**roberta_inputs_topic).last_hidden_state[:, 0, :]
outputs = self.model(roberta_text_features, roberta_topic_features, resnet_features, densenet_features)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()
self.train_losses.append(loss.item())
batch_end_time = time.time()
print(f'Batch {i_batch} completed in {batch_end_time - batch_start_time:.2f} seconds ({(batch_end_time - batch_start_time) / 60:.2f} minutes)')
if i_batch % self.opt.log_step == 0:
dev_acc, dev_f1 = self.evaluate(self.dev_data_loader)
test_acc, test_f1 = self.evaluate(self.test_data_loader)
if dev_f1 > max_dev_f1:
print(f"max_dev_f1: {max_dev_f1}, max_test_f1: {max_test_f1}")
max_dev_f1 = dev_f1
max_test_f1 = test_f1
print(f'loss: {loss.item():.6f}, dev_acc: {dev_acc * 100:.2f}% ({dev_acc:.6f}), dev_f1: {dev_f1 * 100:.2f}% ({dev_f1:.6f}), test_acc: {test_acc * 100:.2f}% ({test_acc:.6f}), test_f1: {test_f1 * 100:.2f}% ({test_f1:.6f})')
epoch_end_time = time.time()
print(f'Epoch {epoch} completed in {epoch_end_time - epoch_start_time:.2f} seconds ({(epoch_end_time - epoch_start_time) / 60:.2f} minutes)')
print(f'Max dev F1: {max_dev_f1 * 100:.2f}% ({max_dev_f1:.6f}), Max test F1: {max_test_f1 * 100:.2f}% ({max_test_f1:.6f})')
self.save_training_loss_plot()
这是运行代码的 Slurm 脚本:
#!/bin/bash -l
#SBATCH --job-name=${MODEL_NAME} # Name of your job
#SBATCH --account=${ACCOUNT} # Your Slurm account
#SBATCH --partition=${PARTITION} # Run on tier3
#SBATCH --time=1-00:00:00 # 4 hours time limit
#SBATCH --nodes=1 # # of nodes
#SBATCH --ntasks=1 # 1 task (i.e. process)
#SBATCH --mem=32g # Increase RAM to 16GB
#SBATCH --gres=gpu:a100:3 # 1 a100 GPU
#SBATCH --output=${OUTPUT_PATH} # Output file
#SBATCH --error=${ERROR_PATH} # Error file
# Load necessary environment
spack env activate nlp
# Run the main script
cd ${REPO_DIR}/
stdbuf -oL -eL bash ${REPO_DIR}/Models/${MODEL_NAME}/run.sh
如果我可以提供更多信息来帮助我解决此问题,请告诉我,我将更新帖子。
谢谢!
我尝试使用一个 GPU 运行模型,效果很好,但需要大约 10 个小时。我们学校有很多 a100 可供使用,所以我想提高性能,因为我训练模型的下一个数据集更大。批量大小为 64 是可行的,但我想将其增加到 128 或更高以加快性能,但似乎内存不足。
您是否尝试过将
#SBATCH --ntasks=1
增加到 2?它应该为您的脚本分配更多的核心。