Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu - can't figure out what's wrong


I am trying to train an object detection model on a GPU. The code is written in PyTorch. There are existing questions about this same error, but unfortunately none of their answers worked for me.

The GPU device is declared as follows:

device = torch.device("cuda:0")
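
(As an aside, a common defensive variant falls back to the CPU when CUDA is unavailable; this is only a sketch, not part of my original code:)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")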

My training loop is as follows:

detector = TwoStageDetector(img_size, out_size, out_c, n_classes, roi_size)
detector = detector.to(device)
#detector.eval()
#total_loss = detector(img_batch, gt_bboxes_batch, gt_classes_batch)
#proposals_final, conf_scores_final, classes_final = detector.inference(img_batch)

print("STARTING TRAINING")
def training_loop(model, learning_rate, train_dataloader, n_epochs):
    #model=model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    loss_list = []

    for i in tqdm(range(n_epochs)):
        total_loss = 0
        for img_batch, gt_bboxes_batch, gt_classes_batch in train_dataloader:
            img_batch = img_batch.to(device)
            gt_bboxes_batch = gt_bboxes_batch.to(device)
            gt_classes_batch = gt_classes_batch.to(device)
            # forward pass
            loss = model(img_batch, gt_bboxes_batch, gt_classes_batch)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        loss_list.append(total_loss)

    return loss_list

learning_rate = 1e-3
n_epochs = 1000

loss_list = training_loop(detector, learning_rate, od_dataloader, n_epochs)

The relevant model class from the model.py file is as follows:

class TwoStageDetector(nn.Module):
    def __init__(self, img_size, out_size, out_channels, n_classes, roi_size):
        super().__init__() 
        self.rpn = RegionProposalNetwork(img_size, out_size, out_channels)
        self.classifier = ClassificationModule(out_channels, n_classes, roi_size)
        
    def forward(self, images, gt_bboxes, gt_classes):
        total_rpn_loss, feature_map, proposals, \
        positive_anc_ind_sep, GT_class_pos = self.rpn(images, gt_bboxes, gt_classes)
        
        # get separate proposals for each sample
        pos_proposals_list = []
        batch_size = images.size(dim=0)
        for idx in range(batch_size):
            proposal_idxs = torch.where(positive_anc_ind_sep == idx)[0]
            proposals_sep = proposals[proposal_idxs].detach().clone()
            pos_proposals_list.append(proposals_sep)
        
        cls_loss = self.classifier(feature_map, pos_proposals_list, GT_class_pos)
        total_loss = cls_loss + total_rpn_loss
        
        return total_loss
    
    def inference(self, images, conf_thresh=0.5, nms_thresh=0.7):
        batch_size = images.size(dim=0)
        proposals_final, conf_scores_final, feature_map = self.rpn.inference(images, conf_thresh, nms_thresh)
        cls_scores = self.classifier(feature_map, proposals_final)
        
        # convert scores into probability
        cls_probs = F.softmax(cls_scores, dim=-1)
        # get classes with highest probability
        classes_all = torch.argmax(cls_probs, dim=-1)
        
        classes_final = []
        # slice classes to map to their corresponding image
        c = 0
        for i in range(batch_size):
            n_proposals = len(proposals_final[i]) # get the number of proposals for each image
            classes_final.append(classes_all[c: c+n_proposals])
            c += n_proposals
            
        return proposals_final, conf_scores_final, classes_final

The get_iou_mat() and get_req_anchors() functions from the utils.py file are as follows:

def get_iou_mat(batch_size, anc_boxes_all, gt_bboxes_all):
    # flatten anchor boxes
    anc_boxes_flat = anc_boxes_all.reshape(batch_size, -1, 4)
    # get total anchor boxes for a single image
    tot_anc_boxes = anc_boxes_flat.size(dim=1)
    # create a placeholder to compute IoUs amongst the boxes
    ious_mat = torch.zeros((batch_size, tot_anc_boxes, gt_bboxes_all.size(dim=1)))

    # compute IoU of the anc boxes with the gt boxes for all the images
    for i in range(batch_size):
        gt_bboxes = gt_bboxes_all[i]
        #gt_bboxes = gt_bboxes[None, :]
        anc_boxes = anc_boxes_flat[i]
        ious_mat[i, :] = ops.box_iou(anc_boxes, gt_bboxes)
    return ious_mat


def get_req_anchors(anc_boxes_all, gt_bboxes_all, gt_classes_all, pos_thresh=0.7, neg_thresh=0.2):
    '''
    Prepare necessary data required for training

    Input
    ------
    anc_boxes_all - torch.Tensor of shape (B, w_amap, h_amap, n_anchor_boxes, 4)
        all anchor boxes for a batch of images
    gt_bboxes_all - torch.Tensor of shape (B, max_objects, 4)
        padded ground truth boxes for a batch of images
    gt_classes_all - torch.Tensor of shape (B, max_objects)
        padded ground truth classes for a batch of images

    Returns
    ---------
    positive_anc_ind -  torch.Tensor of shape (n_pos,)
        flattened positive indices for all the images in the batch
    negative_anc_ind - torch.Tensor of shape (n_pos,)
        flattened negative indices for all the images in the batch
    GT_conf_scores - torch.Tensor of shape (n_pos,), IoU scores of +ve anchors
    GT_offsets -  torch.Tensor of shape (n_pos, 4),
        offsets between +ve anchors and their corresponding ground truth boxes
    GT_class_pos - torch.Tensor of shape (n_pos,)
        mapped classes of +ve anchors
    positive_anc_coords - (n_pos, 4) coords of +ve anchors (for visualization)
    negative_anc_coords - (n_pos, 4) coords of -ve anchors (for visualization)
    positive_anc_ind_sep - list of indices to keep track of +ve anchors
    '''
    # get the size and shape parameters
    B, w_amap, h_amap, A, _ = anc_boxes_all.shape
    N = gt_bboxes_all.shape[1]  # max number of ground-truth bboxes per image in the batch

    # get total number of anchor boxes in a single image
    tot_anc_boxes = A * w_amap * h_amap

    # get the iou matrix which contains iou of every anchor box
    # against all the groundtruth bboxes in an image
    iou_mat = get_iou_mat(B, anc_boxes_all, gt_bboxes_all)
    #print(iou_mat.shape)
    # for every groundtruth bbox in an image, find the iou
    # with the anchor box which it overlaps the most
    max_iou_per_gt_box, _ = iou_mat.max(dim=1, keepdim=True)
    #print(max_iou_per_gt_box.shape)
    #print(max_iou_per_gt_box)
    # get positive anchor boxes

    # condition 1: the anchor box with the max iou for every gt bbox
    #print(max_iou_per_gt_box > 0)
    positive_anc_mask = torch.logical_and(iou_mat == max_iou_per_gt_box, max_iou_per_gt_box > 0)
    #print(positive_anc_mask.shape)
    # condition 2: anchor boxes with iou above a threshold with any of the gt bboxes
    positive_anc_mask = torch.logical_or(positive_anc_mask, iou_mat > pos_thresh)
    #print(positive_anc_mask.shape)

    positive_anc_ind_sep = torch.where(positive_anc_mask)[0]  # get separate indices in the batch
    # combine all the batches and get the idxs of the +ve anchor boxes
    positive_anc_mask = positive_anc_mask.flatten(start_dim=0, end_dim=1)
    positive_anc_ind = torch.where(positive_anc_mask)[0]

    # for every anchor box, get the iou and the idx of the
    # gt bbox it overlaps with the most
    max_iou_per_anc, max_iou_per_anc_ind = iou_mat.max(dim=-1)
    max_iou_per_anc = max_iou_per_anc.flatten(start_dim=0, end_dim=1)

    # get iou scores of the +ve anchor boxes
    GT_conf_scores = max_iou_per_anc[positive_anc_ind]

    # get gt classes of the +ve anchor boxes

    # expand gt classes to map against every anchor box
    #print(gt_classes_all.shape)
    gt_classes_expand = gt_classes_all.view(B, 1, N).expand(B, tot_anc_boxes, N)
    # for every anchor box, consider only the class of the gt bbox it overlaps with the most
    GT_class = torch.gather(gt_classes_expand, -1, max_iou_per_anc_ind.unsqueeze(-1)).squeeze(-1)
    # combine all the batches and get the mapped classes of the +ve anchor boxes
    GT_class = GT_class.flatten(start_dim=0, end_dim=1)
    GT_class_pos = GT_class[positive_anc_ind]

    # get gt bbox coordinates of the +ve anchor boxes

    # expand all the gt bboxes to map against every anchor box
    gt_bboxes_expand = gt_bboxes_all.view(B, 1, N, 4).expand(B, tot_anc_boxes, N, 4)
    # for every anchor box, consider only the coordinates of the gt bbox it overlaps with the most
    GT_bboxes = torch.gather(gt_bboxes_expand, -2,
                             max_iou_per_anc_ind.reshape(B, tot_anc_boxes, 1, 1).repeat(1, 1, 1, 4))
    # combine all the batches and get the mapped gt bbox coordinates of the +ve anchor boxes
    GT_bboxes = GT_bboxes.flatten(start_dim=0, end_dim=2)
    GT_bboxes_pos = GT_bboxes[positive_anc_ind]

    # get coordinates of +ve anc boxes
    anc_boxes_flat = anc_boxes_all.flatten(start_dim=0, end_dim=-2)  # flatten all the anchor boxes
    positive_anc_coords = anc_boxes_flat[positive_anc_ind]

    # calculate gt offsets
    GT_offsets = calc_gt_offsets(positive_anc_coords, GT_bboxes_pos)

    # get -ve anchors

    # condition: select the anchor boxes with max iou less than the threshold
    negative_anc_mask = (max_iou_per_anc < neg_thresh)
    negative_anc_ind = torch.where(negative_anc_mask)[0]
    # sample -ve samples to match the +ve samples
    negative_anc_ind = negative_anc_ind[torch.randint(0, negative_anc_ind.shape[0], (positive_anc_ind.shape[0],))]
    negative_anc_coords = anc_boxes_flat[negative_anc_ind]

    return positive_anc_ind, negative_anc_ind, GT_conf_scores, GT_offsets, GT_class_pos, \
           positive_anc_coords, negative_anc_coords, positive_anc_ind_sep

I have not explicitly moved any of the tensors in the utils.py file onto the GPU.
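
For reference, a quick way to see which tensor lives where is to print the .device attribute just before the failing call; this check is only illustrative and not part of my original code:

# Illustrative device check inside get_iou_mat(), right before ops.box_iou():
print(anc_boxes.device)  # anchors built inside utils.py
print(gt_bboxes.device)  # ground-truth boxes moved to the GPU in the training loop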

The error stack trace is as follows:

File "/home/main.py", line 353, in <module>
    loss_list = training_loop(detector, learning_rate, od_dataloader, n_epochs)
  File "/home/main.py", line 336, in training_loop
    loss = model(img_batch, gt_bboxes_batch, gt_classes_batch)
  File "/home/miniconda3/envs/pytor/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/model.py", line 215, in forward
    positive_anc_ind_sep, GT_class_pos = self.rpn(images, gt_bboxes, gt_classes)
  File "/home/miniconda3/envs/pytor/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/model.py", line 104, in forward
    negative_anc_coords, positive_anc_ind_sep = get_req_anchors(anc_boxes_all, gt_bboxes_proj, gt_classes)
  File "/home/utils.py", line 222, in get_req_anchors
    iou_mat = get_iou_mat(B, anc_boxes_all, gt_bboxes_all)
  File "/home/utils.py", line 181, in get_iou_mat
    ious_mat[i, :] = ops.box_iou(anc_boxes, gt_bboxes)
  File "/home/miniconda3/envs/pytor/lib/python3.9/site-packages/torchvision/ops/boxes.py", line 271, in box_iou
    inter, union = _box_inter_union(boxes1, boxes2)
  File "/home/miniconda3/envs/pytor/lib/python3.9/site-packages/torchvision/ops/boxes.py", line 244, in _box_inter_union
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

I have tried moving various tensors onto the GPU in an attempt to eliminate this error, but without success. It would be great if someone could point out what I am doing wrong. Thanks.

Tags: pytorch, gpu, tensor
1 Answer

The error comes from this line in torchvision's box_iou:
 lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]

boxes1 is on GPU zero (cuda:0) while boxes2 is on the CPU.
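
A frequent cause is a helper that allocates tensors without a device argument: torch.zeros(...) defaults to the CPU, so the placeholder in get_iou_mat() and any anchors created in utils.py stay on the CPU even after the batch has been moved. A device-safe variant of get_iou_mat() (a sketch that simply picks one common device and moves both operands onto it) would look like this:

def get_iou_mat(batch_size, anc_boxes_all, gt_bboxes_all):
    # Pick one common device (here, the one the ground-truth boxes are on)
    # and keep every tensor in this function on it.
    device = gt_bboxes_all.device
    anc_boxes_flat = anc_boxes_all.reshape(batch_size, -1, 4).to(device)
    tot_anc_boxes = anc_boxes_flat.size(dim=1)
    # Allocate the placeholder on the same device instead of the CPU default.
    ious_mat = torch.zeros((batch_size, tot_anc_boxes, gt_bboxes_all.size(dim=1)), device=device)
    for i in range(batch_size):
        ious_mat[i, :] = ops.box_iou(anc_boxes_flat[i], gt_bboxes_all[i])
    return ious_mat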

Can you show how you define your model, and where boxes1 and boxes2 come from? Can you also show where you define device?
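
More generally, tensors created in a module's __init__ (such as a precomputed anchor grid) only follow model.to(device) if they are registered as buffers. A minimal sketch with a hypothetical module, not the actual RegionProposalNetwork from the question:

import torch
import torch.nn as nn

class AnchorHolder(nn.Module):
    # Hypothetical module illustrating register_buffer for non-parameter tensors.
    def __init__(self, anchors):
        super().__init__()
        # A buffer moves with .to(device) / .cuda() but is not a learnable parameter.
        self.register_buffer("anchors", anchors)

holder = AnchorHolder(torch.rand(100, 4))
holder = holder.to(torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
print(holder.anchors.device)  # the buffer follows the module's device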
