For a video classification task on the Kinetics dataset, I followed the example on this page to train my model. The dataset has the following structure:
As mentioned on that page, train.csv and validation.csv are required. Following those instructions, I prepared CSV files with the following structure:
Unfortunately, I ran into the following error:
ValueError: invalid literal for int() with base 10: 'Dancing'
So I followed the instructions on this page: since the label column evidently has to be an integer, I copied all the videos from the separate label folders into one main folder and, instead of text labels in the CSV file, used a number for each label, as follows:
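For reference, this is roughly how I build that integer-label CSV (a sketch; the source and output paths are placeholders for my layout). As far as I can tell, pytorchvideo's LabeledVideoPaths.from_csv splits each row on whitespace and casts the last field to int, which would also explain the ValueError above:

import os

# Assumed layout: src_root/<label name>/<video file>; both paths are
# placeholders for my setup.
src_root = "/mnt/storage_6TB/user/kinetics400/train"
out_csv = "/mnt/storage_6TB/user/kinetics400/train_subdivision/my_train.csv"

class_names = sorted(os.listdir(src_root))
class_to_idx = {name: i for i, name in enumerate(class_names)}

with open(out_csv, "w") as f:
    for name in class_names:
        label_dir = os.path.join(src_root, name)
        for fname in sorted(os.listdir(label_dir)):
            # Whitespace-separated "<path> <integer label>" rows.
            f.write(f"{os.path.join(label_dir, fname)} {class_to_idx[name]}\n")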
But then I get the following error:
Exception has occurred: RuntimeError
Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/miniconda3/envs/new/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/miniconda3/envs/new/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 32, in fetch
    data.append(next(self.dataset_iter))
  File "/home/miniconda3/envs/new/lib/python3.10/site-packages/pytorchvideo/data/labeled_video_dataset.py", line 225, in __next__
    raise RuntimeError(
RuntimeError: Failed to load video after 10 retries.
  File "/home/main/light_vid_class/training.py", line 199, in train
    trainer.fit(classification_module, data_module)
  File "/home/main/light_vid_class/training.py", line 204, in <module>
    train()
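To narrow this down, I can parse the CSV the same way the dataset does and try to decode every file directly. This is only a sketch based on my reading of the pytorchvideo internals (LabeledVideoPaths and EncodedVideo) in the version I have installed:

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths

csv_path = "/mnt/storage_6TB/user/kinetics400/train_subdivision/my_train.csv"
labeled_paths = LabeledVideoPaths.from_path(csv_path)

# Any file that raises here is one the DataLoader worker keeps retrying on.
for video_path, info in labeled_paths:
    try:
        EncodedVideo.from_path(video_path)
    except Exception as e:
        print(f"FAILED {video_path}: {e}")

My expectation is that this would flag paths that went stale after the copy step, or files that PyAV cannot decode.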
Here is my script:
import os

import pytorch_lightning
import pytorchvideo.data
import torch
import torch.nn as nn
import torch.utils.data

from Spatial_Encoder import Spatial_Encoder as model

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

########################################### Transformations #######################################
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
)
##########################################################################################
class KineticsDataModule(pytorch_lightning.LightningDataModule):
    def __init__(self):
        super(KineticsDataModule, self).__init__()
        # Dataset configuration
        # self._DATA_PATH = "/mnt/storage_6TB/user/kinetics400"
        self._CLIP_DURATION = 2  # Duration of the clip sampled from each video
        self._BATCH_SIZE = 16
        self._NUM_WORKERS = 8  # Number of parallel processes fetching data
        # ...

    def train_dataloader(self):
        """
        Create the Kinetics train partition from the list of video paths and
        labels in my_train.csv. The transform subsamples and normalizes the
        video before applying the scale, crop and flip augmentations.
        """
        train_transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(8),
                            Lambda(lambda x: x / 255.0),
                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                            RandomShortSideScale(min_size=224, max_size=224),
                            # The crop size must not exceed the scaled short
                            # side, so 224 here (the tutorial's 244 would fail
                            # against a 224-pixel short side).
                            RandomCrop(224),
                            RandomHorizontalFlip(p=0.5),
                        ]
                    ),
                ),
            ]
        )
        train_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join("/mnt/storage_6TB/user/kinetics400/train_subdivision", "my_train.csv"),
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            transform=train_transform,
        )
        return torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
        )

    def val_dataloader(self):
        """
        Create the Kinetics val partition from the list of video paths and
        labels in my_val.csv. The transform only subsamples and normalizes the
        video, without the train-time augmentations.
        """
        val_transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(8),
                            Lambda(lambda x: x / 255.0),
                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                        ]
                    ),
                ),
            ]
        )
        val_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join("/mnt/storage_6TB/user/kinetics400/val_subdivision", "my_val.csv"),
            # Note: the tutorial uses a "uniform" clip sampler for validation.
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            transform=val_transform,
        )
        return torch.utils.data.DataLoader(
            val_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
        )
# ...
########################################## Model ##########################################
# import pytorchvideo.models.resnet
'''
def make_kinetics_resnet():
    return pytorchvideo.models.resnet.create_resnet(
        input_channel=3,      # RGB input from Kinetics
        model_depth=50,       # For the tutorial let's just use a 50 layer network
        model_num_class=400,  # Kinetics has 400 classes so we need our final head to align
        norm=nn.BatchNorm3d,
        activation=nn.ReLU,
    )
'''
######################################## Putting it all together ###########################
import torch.nn.functional as F
##########################################################################################
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = model()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # The model expects a video tensor of shape (B, C, T, H, W), which is
        # the format provided by the dataset.
        y_hat = self.model(batch["video"])
        # Compute cross-entropy loss; loss.backward() will be called behind the
        # scenes by PyTorch Lightning after this method returns.
        loss = F.cross_entropy(y_hat, batch["label"])
        # Log the train loss to TensorBoard.
        self.log("train_loss", loss.item())
        return loss

    def validation_step(self, batch, batch_idx):
        y_hat = self.model(batch["video"])
        loss = F.cross_entropy(y_hat, batch["label"])
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """
        Set up the Adam optimizer. Note that this function can also return an
        LR scheduler, which is usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=1e-1)
############################################ Train ############################################
def train():
    classification_module = VideoClassificationLightningModule()
    data_module = KineticsDataModule()
    trainer = pytorch_lightning.Trainer(devices=1, accelerator="gpu")
    trainer.fit(classification_module, data_module)

if __name__ == '__main__':
    train()
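To see the underlying exception without the worker indirection, I can also pull a single batch myself (a sketch; with num_workers temporarily set to 0 in the DataModule, the original decode error should surface directly instead of the "Caught RuntimeError in DataLoader worker" wrapper):

# Minimal repro: grab one batch from the train loader defined above.
data_module = KineticsDataModule()
loader = data_module.train_dataloader()
batch = next(iter(loader))  # this is where I'd expect the same RuntimeError
print(batch["video"].shape, batch["label"])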
How can I fix this?