这是我的代码,我目前在 4 个 GPU 上运行它
setup(rank, gpus)
dataset = RandomDataset(input_shape, 80*batch_size, rank)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
data_iter = iter(dataloader)
model = model(pretrained=True).to(rank)
optimizer = optim.SGD(model.parameters(), lr=0.0001)
criterion = torch.nn.CrossEntropyLoss()
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
print("[MAKING DDP Model]")
model = DDP(model)
print("[MODEL CREATED]")
for i in range(11):
optimizer.zero_grad(set_to_none=True)
inputs, labels = next(data_iter)
output = model(inputs)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
capture_input = torch.empty((batch_size, 3, input_shape, input_shape)).to(rank)
capture_target = torch.argmax(torch.from_numpy(np.eye(1000)[np.random.choice(1000, batch_size)]), axis=1).to(rank)
g = torch.cuda.CUDAGraph()
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
capture_y_pred = model(capture_input)
capture_loss = criterion(capture_y_pred, capture_target)
capture_loss.backward()
optimizer.step()
print("RECORDED")
for i in range(20):
inputs, label = next(data_iter)
capture_input.copy_(inputs)
capture_target.copy_(label)
g.replay()
optimizer.step()
print("DATASET DONE")
运行时错误:CUDA 错误:由于捕获期间先前的错误,操作失败 CUDA 内核错误可能会在其他一些 API 调用中异步报告,因此下面的堆栈跟踪可能不正确。 对于调试,请考虑传递 CUDA_LAUNCH_BLOCKING=1。
有谁知道如何解决这个问题吗?
根据官方文档(link),建议在执行全向后捕获之前初始化DDP模型。因此,可以在预热步骤之前创建模型。例如:
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
ddp_model = torch.nn.parallel.DistributedDataParallel(model)
torch.cuda.current_stream().wait_stream(s)