我正在尝试使用 2 个 GPU 来训练超级梯度库中的 yolo nas 模型。
我参考了这个链接
另请参阅这个
基本上,我正在尝试使用 2 个 GPU 进行 multipgu 训练。我正在尝试使用分布式数据并行策略,但是由于某种原因它给出了错误。有些采样器没有钥匙或其他东西。 这是代码:
import torch
import os
from PIL import Image
from super_gradients.training import Trainer, dataloaders, models
from super_gradients.training.dataloaders.dataloaders import (
coco_detection_yolo_format_train, coco_detection_yolo_format_val
)
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import DetectionMetrics_050
from super_gradients.training.models.detection_models.pp_yolo_e import (
PPYoloEPostPredictionCallback
)
from super_gradients.training.utils.distributed_training_utils import setup_device
class config:
#trainer params
CHECKPOINT_DIR = 'checkpoints' #specify the path you want to save checkpoints to
EXPERIMENT_NAME = 'experiment_v2' #specify the experiment name
#dataset params
DATA_DIR = 'yolo_data' #parent directory to where data lives
TRAIN_IMAGES_DIR = 'train/images' #child dir of DATA_DIR where train images are
TRAIN_LABELS_DIR = 'train/labels' #child dir of DATA_DIR where train labels are
VAL_IMAGES_DIR = 'val/images' #child dir of DATA_DIR where validation images are
VAL_LABELS_DIR = 'val/labels' #child dir of DATA_DIR where validation labels are
# if you have a test set
TEST_IMAGES_DIR = 'test/images' #child dir of DATA_DIR where test images are
TEST_LABELS_DIR = 'test/labels' #child dir of DATA_DIR where test labels are
CLASSES = ['Face'] #what class names do you have
NUM_CLASSES = len(CLASSES)
#dataloader params - you can add whatever PyTorch dataloader params you have
#could be different across train, val, and test
DATALOADER_PARAMS={
'batch_size':64,
'num_workers':4
}
# model params
MODEL_NAME = 'yolo_nas_l' # choose from yolo_nas_s, yolo_nas_m, yolo_nas_l
PRETRAINED_WEIGHTS = 'coco' #only one option here: coco
DEVICE = 'cuda' if torch.cuda.is_available() else "cpu"
setup_device(multi_gpu='DDP', num_gpus=2)
trainer = Trainer(experiment_name=config.EXPERIMENT_NAME, ckpt_root_dir=config.CHECKPOINT_DIR)
train_data = coco_detection_yolo_format_train(
dataset_params={
'data_dir': config.DATA_DIR,
'images_dir': config.TRAIN_IMAGES_DIR,
'labels_dir': config.TRAIN_LABELS_DIR,
'classes': config.CLASSES
},
dataloader_params=config.DATALOADER_PARAMS
)
val_data = coco_detection_yolo_format_val(
dataset_params={
'data_dir': config.DATA_DIR,
'images_dir': config.VAL_IMAGES_DIR,
'labels_dir': config.VAL_LABELS_DIR,
'classes': config.CLASSES
},
dataloader_params=config.DATALOADER_PARAMS
)
model = models.get(config.MODEL_NAME,
num_classes=config.NUM_CLASSES,
pretrained_weights=config.PRETRAINED_WEIGHTS
).to(config.DEVICE)
train_params = {
"average_best_models":True,
"warmup_mode": "linear_epoch_step",
"warmup_initial_lr": 8e-6,
"lr_warmup_epochs": 5,
"initial_lr": 40e-4,
"lr_mode": "cosine",
"cosine_final_lr_ratio": 0.1,
"optimizer": "Adam",
"optimizer_params": {"weight_decay": 0.0001},
"zero_weight_decay_on_bias_and_bn": True,
"ema": True,
"ema_params": {"decay": 0.9, "decay_type": "threshold"},
"max_epochs": 300,
"mixed_precision": True,
"loss": PPYoloELoss(
use_static_assigner=False,
# NOTE: num_classes needs to be defined here
num_classes=config.NUM_CLASSES,
reg_max=16
),
"valid_metrics_list": [
DetectionMetrics_050(
score_thres=0.1,
top_k_predictions=300,
# NOTE: num_classes needs to be defined here
num_cls=config.NUM_CLASSES,
normalize_targets=True,
post_prediction_callback=PPYoloEPostPredictionCallback(
score_threshold=0.01,
nms_top_k=1000,
max_predictions=300,
nms_threshold=0.7
)
)
],
"metric_to_watch": '[email protected]'
}
trainer.train(model=model,
training_params=train_params,
train_loader=train_data,
valid_loader=val_data)
这里有错误!
raceback (most recent call last):
File "train.py", line 72, in <module>
val_data = coco_detection_yolo_format_val(
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 286, in coco_detection_yolo_format_val
return get_data_loader(
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 76, in get_data_loader
dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 96, in _process_dataloader_params
dataloader_params = _process_sampler_params(dataloader_params, dataset, default_dataloader_params)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 121, in _process_sampler_params
dataloader_params = _instantiate_sampler(dataset, dataloader_params)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 144, in _instantiate_sampler
sampler_name = list(dataloader_params["sampler"].keys())[0]
AttributeError: 'DistributedSampler' object has no attribute 'keys'
Caching annotations: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3226/3226 [00:00<00:00, 5659.64it/s]
Traceback (most recent call last):
File "train.py", line 72, in <module>
val_data = coco_detection_yolo_format_val(
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 286, in coco_detection_yolo_format_val
return get_data_loader(
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 76, in get_data_loader
dataloader_params = _process_dataloader_params(cfg, dataloader_params, dataset, train)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 96, in _process_dataloader_params
dataloader_params = _process_sampler_params(dataloader_params, dataset, default_dataloader_params)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 121, in _process_sampler_params
dataloader_params = _instantiate_sampler(dataset, dataloader_params)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/dataloaders/dataloaders.py", line 144, in _instantiate_sampler
sampler_name = list(dataloader_params["sampler"].keys())[0]
AttributeError: 'DistributedSampler' object has no attribute 'keys'
[2023-06-08 19:04:47] WARNING - api.py - Sending process 74086 closing signal SIGTERM
WARNING: Logging before flag parsing goes to stderr.
W0608 19:04:47.502784 140609054398272 api.py:699] Sending process 74086 closing signal SIGTERM
[2023-06-08 19:04:47] ERROR - api.py - failed (exitcode: 1) local_rank: 0 (pid: 74081) of binary: /opt/conda/bin/python3
E0608 19:04:47.773856 140609054398272 api.py:673] failed (exitcode: 1) local_rank: 0 (pid: 74081) of binary: /opt/conda/bin/python3
Traceback (most recent call last):
File "train.py", line 56, in <module>
setup_device(multi_gpu='DDP', num_gpus=2)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/common/decorators/factory_decorator.py", line 36, in wrapper
return func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/utils/distributed_training_utils.py", line 240, in setup_device
setup_gpu(multi_gpu, num_gpus)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/utils/distributed_training_utils.py", line 278, in setup_gpu
restart_script_with_ddp(num_gpus=num_gpus)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/super_gradients/training/utils/distributed_training_utils.py", line 387, in restart_script_with_ddp
elastic_launch(config=config, entrypoint=sys.executable)(*sys.argv, *EXTRA_ARGS)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-06-08_19:04:47
host : 0d9c785c0a81
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 74081)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
尝试添加设备配置 setup_device(设备='cuda',multi_gpu='DDP',num_gpus=2)