For a video classification task on the Kinetics dataset, I followed the example on this page to train my model. The dataset has the following structure:
As mentioned on that page, train.csv and validation.csv are required. Following those instructions, I prepared CSV files with the following structure:
Unfortunately, I ran into the following error:
ValueError: invalid literal for int() with base 10: 'Dancing'
So I followed the instructions on this page: since the label column evidently has to be an integer, I copied all the videos from the separate label folders into one main folder and, instead of text labels in the CSV file, used a number for each label, as follows:
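For reference, this is roughly how I build that integer-label CSV (a sketch; the source and output paths are placeholders for my layout). As far as I can tell, pytorchvideo's LabeledVideoPaths.from_csv splits each row on whitespace and casts the last field to int, which would also explain the ValueError above:

import os

# Assumed layout: src_root/<label name>/<video file>; both paths are
# placeholders for my setup.
src_root = "/mnt/storage_6TB/user/kinetics400/train"
out_csv = "/mnt/storage_6TB/user/kinetics400/train_subdivision/my_train.csv"

class_names = sorted(os.listdir(src_root))
class_to_idx = {name: i for i, name in enumerate(class_names)}

with open(out_csv, "w") as f:
    for name in class_names:
        label_dir = os.path.join(src_root, name)
        for fname in sorted(os.listdir(label_dir)):
            # Whitespace-separated "<path> <integer label>" rows.
            f.write(f"{os.path.join(label_dir, fname)} {class_to_idx[name]}\n")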
But then I get the following error:
Exception has occurred: RuntimeError
Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/miniconda3/envs/new/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/miniconda3/envs/new/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 32, in fetch
    data.append(next(self.dataset_iter))
  File "/home/miniconda3/envs/new/lib/python3.10/site-packages/pytorchvideo/data/labeled_video_dataset.py", line 225, in __next__
    raise RuntimeError(
RuntimeError: Failed to load video after 10 retries.
  File "/home/main/light_vid_class/training.py", line 199, in train
    trainer.fit(classification_module, data_module)
  File "/home/main/light_vid_class/training.py", line 204, in <module>
    train()
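To narrow this down, I can parse the CSV the same way the dataset does and try to decode every file directly. This is only a sketch based on my reading of the pytorchvideo internals (LabeledVideoPaths and EncodedVideo) in the version I have installed:

from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.data.labeled_video_paths import LabeledVideoPaths

csv_path = "/mnt/storage_6TB/user/kinetics400/train_subdivision/my_train.csv"
labeled_paths = LabeledVideoPaths.from_path(csv_path)

# Any file that raises here is one the DataLoader worker keeps retrying on.
for video_path, info in labeled_paths:
    try:
        EncodedVideo.from_path(video_path)
    except Exception as e:
        print(f"FAILED {video_path}: {e}")

My expectation is that this would flag paths that went stale after the copy step, or files that PyAV cannot decode.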
Here is my script:
import os

import pytorch_lightning
import pytorchvideo.data
import torch
import torch.nn as nn
import torch.utils.data

from Spatial_Encoder import Spatial_Encoder as model

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

########################################### Transformations #######################################
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
)
##########################################################################################
class KineticsDataModule(pytorch_lightning.LightningDataModule):
    def __init__(self):
        super(KineticsDataModule, self).__init__()
        # Dataset configuration
        # self._DATA_PATH = "/mnt/storage_6TB/user/kinetics400"
        self._CLIP_DURATION = 2  # Duration of the clip sampled from each video
        self._BATCH_SIZE = 16
        self._NUM_WORKERS = 8  # Number of parallel processes fetching data
        # ...

    def train_dataloader(self):
        """
        Create the Kinetics train partition from the list of video paths and
        labels in my_train.csv. The transform subsamples and normalizes the
        video before applying the scale, crop and flip augmentations.
        """
        train_transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(8),
                            Lambda(lambda x: x / 255.0),
                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                            RandomShortSideScale(min_size=224, max_size=224),
                            # The crop size must not exceed the scaled short
                            # side, so 224 here (the tutorial's 244 would fail
                            # against a 224-pixel short side).
                            RandomCrop(224),
                            RandomHorizontalFlip(p=0.5),
                        ]
                    ),
                ),
            ]
        )
        train_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join("/mnt/storage_6TB/user/kinetics400/train_subdivision", "my_train.csv"),
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            transform=train_transform,
        )
        return torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
        )

    def val_dataloader(self):
        """
        Create the Kinetics val partition from the list of video paths and
        labels in my_val.csv. The transform only subsamples and normalizes the
        video, without the train-time augmentations.
        """
        val_transform = Compose(
            [
                ApplyTransformToKey(
                    key="video",
                    transform=Compose(
                        [
                            UniformTemporalSubsample(8),
                            Lambda(lambda x: x / 255.0),
                            Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                        ]
                    ),
                ),
            ]
        )
        val_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join("/mnt/storage_6TB/user/kinetics400/val_subdivision", "my_val.csv"),
            # Note: the tutorial uses a "uniform" clip sampler for validation.
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            transform=val_transform,
        )
        return torch.utils.data.DataLoader(
            val_dataset,
            batch_size=self._BATCH_SIZE,
            num_workers=self._NUM_WORKERS,
        )
# ...
########################################## Model ##########################################
# import pytorchvideo.models.resnet
'''
def make_kinetics_resnet():
    return pytorchvideo.models.resnet.create_resnet(
        input_channel=3,      # RGB input from Kinetics
        model_depth=50,       # For the tutorial let's just use a 50 layer network
        model_num_class=400,  # Kinetics has 400 classes so we need our final head to align
        norm=nn.BatchNorm3d,
        activation=nn.ReLU,
    )
'''
######################################## Putting it all together ###########################
import torch.nn.functional as F
##########################################################################################
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = model()

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # The model expects a video tensor of shape (B, C, T, H, W), which is
        # the format provided by the dataset.
        y_hat = self.model(batch["video"])
        # Compute cross-entropy loss; loss.backward() will be called behind the
        # scenes by PyTorch Lightning after this method returns.
        loss = F.cross_entropy(y_hat, batch["label"])
        # Log the train loss to TensorBoard.
        self.log("train_loss", loss.item())
        return loss

    def validation_step(self, batch, batch_idx):
        y_hat = self.model(batch["video"])
        loss = F.cross_entropy(y_hat, batch["label"])
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """
        Set up the Adam optimizer. Note that this function can also return an
        LR scheduler, which is usually useful for training video models.
        """
        return torch.optim.Adam(self.parameters(), lr=1e-1)
############################################ Train ############################################
def train():
    classification_module = VideoClassificationLightningModule()
    data_module = KineticsDataModule()
    trainer = pytorch_lightning.Trainer(devices=1, accelerator="gpu")
    trainer.fit(classification_module, data_module)

if __name__ == '__main__':
    train()
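To see the underlying exception without the worker indirection, I can also pull a single batch myself (a sketch; with num_workers temporarily set to 0 in the DataModule, the original decode error should surface directly instead of the "Caught RuntimeError in DataLoader worker" wrapper):

# Minimal repro: grab one batch from the train loader defined above.
data_module = KineticsDataModule()
loader = data_module.train_dataloader()
batch = next(iter(loader))  # this is where I'd expect the same RuntimeError
print(batch["video"].shape, batch["label"])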
How can I fix this?