我已将训练数据集分为 80% 的训练数据和 20% 的验证数据,并创建了 DataLoaders,如下所示。但是我不想限制我的模型的训练。所以我想到将我的数据分成 K(也许 5)个折叠并执行交叉验证。但是我不知道如何在分割数据集后将它们合并到我的数据加载器中。
# 80/20 split of the full dataset into train/validation subsets.
train_size = int(0.8 * len(full_dataset))
validation_size = len(full_dataset) - train_size
# random_split shuffles once and returns two index-backed Subset views.
train_dataset, validation_dataset = random_split(full_dataset, [train_size, validation_size])
# NOTE(review): `sampler_` is defined elsewhere — presumably it builds a sampler
# over the given (sub)dataset; verify it handles Subset objects. pin_memory=True
# speeds host-to-GPU copies for the full loader.
full_loader = DataLoader(full_dataset, batch_size=4,sampler = sampler_(full_dataset), pin_memory=True)
train_loader = DataLoader(train_dataset, batch_size=4, sampler = sampler_(train_dataset))
# batch_size=1 for validation so each sample is evaluated individually.
val_loader = DataLoader(validation_dataset, batch_size=1, sampler = sampler_(validation_dataset))
我刚刚编写了一个与数据加载器和数据集一起使用的交叉验证函数。 这是我的代码,希望有帮助。
# define a cross validation function
def crossvalid(model=None, criterion=None, optimizer=None, dataset=None, k_fold=5):
    """Run k-fold cross-validation over ``dataset``.

    Each fold holds out one contiguous ``1/k_fold`` slice of the dataset as
    the validation set and trains on the remaining indices.

    Args:
        model: the model to train. (Bug fix: the original body ignored this
            parameter and trained the global ``res_model`` instead.)
        criterion: loss function, passed through to ``train``/``valid``.
        optimizer: optimizer, passed through to ``train``/``valid``.
        dataset: an indexable torch Dataset.
        k_fold: number of folds (default 5).

    Returns:
        (train_score, val_score): two pandas Series of per-fold accuracies,
        indexed by fold number.
    """
    # Explicit dtype: an empty ``pd.Series()`` without dtype is deprecated.
    train_score = pd.Series(dtype=float)
    val_score = pd.Series(dtype=float)
    total_size = len(dataset)
    for i in range(k_fold):
        # Integer-divide fold boundaries so every sample is used even when
        # total_size is not divisible by k_fold (the old seg-based arithmetic
        # silently dropped the remainder samples from every fold).
        val_left = (i * total_size) // k_fold
        val_right = ((i + 1) * total_size) // k_fold
        # train indices: [0, val_left) + [val_right, total_size)
        # val   indices: [val_left, val_right)
        train_indices = list(range(0, val_left)) + list(range(val_right, total_size))
        val_indices = list(range(val_left, val_right))
        train_set = torch.utils.data.Subset(dataset, train_indices)
        val_set = torch.utils.data.Subset(dataset, val_indices)
        train_loader = torch.utils.data.DataLoader(train_set, batch_size=50,
                                                   shuffle=True, num_workers=4)
        val_loader = torch.utils.data.DataLoader(val_set, batch_size=50,
                                                 shuffle=True, num_workers=4)
        # Bug fix: use the ``model`` argument, not the global ``res_model``.
        train_acc = train(model, criterion, optimizer, train_loader, epoch=1)
        train_score.at[i] = train_acc
        val_acc = valid(model, criterion, optimizer, val_loader)
        val_score.at[i] = val_acc
    return train_score, val_score

train_score, val_score = crossvalid(res_model, criterion, optimizer, dataset=tiny_dataset)
为了直观地了解我们正在做的事情的正确性,请参阅下面的输出:
train indices: [0,0),[3600,18000), test indices: [0,3600)
14400 3600
train indices: [0,3600),[7200,18000), test indices: [3600,7200)
14400 3600
train indices: [0,7200),[10800,18000), test indices: [7200,10800)
14400 3600
train indices: [0,10800),[14400,18000), test indices: [10800,14400)
14400 3600
train indices: [0,14400),[18000,18000), test indices: [14400,18000)
14400 3600
查看使用 pytorch 和 sklearn 对 MNIST 数据集进行交叉验证。提问者实施了 kFold 交叉验证。特别看看他自己的回答(2019年11月23日10:34回答)。他不依赖 random_split() ,而是依赖 sklearn.model_selection.KFold ,并从那里构造一个数据集和一个数据加载器。
您可以通过结合 sklearn 的 KFold 和 PyTorch 的 DataLoader 来实现这一点。
import torch
from torch._six import int_classes as _int_classes
from torch import Tensor
from typing import Iterator, Optional, Sequence, List, TypeVar, Generic, Sized
T_co = TypeVar('T_co', covariant=True)


class Sampler(Generic[T_co]):
    """Abstract base class for index samplers.

    A concrete sampler implements :meth:`__iter__` to yield indices into a
    dataset. Implementing ``__len__`` is conventional — DataLoader length
    computations expect it — but not strictly required.
    """

    def __init__(self, data_source: Optional[Sized]) -> None:
        # The base class keeps no state; subclasses store whatever they need.
        pass

    def __iter__(self) -> Iterator[T_co]:
        raise NotImplementedError
class SubsetRandomSampler(Sampler[int]):
    """Yield the given indices in a fresh random order, without replacement.

    Args:
        indices (sequence): the pool of indices to sample from.
        generator (Generator): optional RNG for reproducible permutations.
    """

    indices: Sequence[int]

    def __init__(self, indices: Sequence[int], generator=None) -> None:
        self.indices = indices
        self.generator = generator

    def __iter__(self):
        # randperm produces a random permutation of positions 0..len-1;
        # map each position back to the caller-supplied index.
        for pos in torch.randperm(len(self.indices), generator=self.generator):
            yield self.indices[pos]

    def __len__(self):
        return len(self.indices)
# Build one dataset per mode so train/val can apply different transforms;
# KFold then produces disjoint index arrays, and SubsetRandomSampler
# restricts each DataLoader to its fold's indices.
train_dataset = CustomDataset(data_dir=train_path, mode='train')  # fixed: stray ')' removed
val_dataset = CustomDataset(data_dir=train_path, mode='val')      # fixed: stray ')' removed

# Renamed from `fold`: the original name was immediately shadowed by the
# loop variable below, clobbering the KFold object.
kf = KFold(5, shuffle=True, random_state=random_seed)
# Fixed: the original split an undefined `dataset`; split over the dataset
# the samplers actually index into (both datasets share data_dir/length).
for fold, (tr_idx, val_idx) in enumerate(kf.split(train_dataset)):
    # Re-initialize model/optimizer every fold so folds are independent.
    model = smp.FPN(encoder_name='efficientnet-b4', classes=12, encoder_weights=None, activation='softmax2d')
    loss = BCEDiceLoss()
    optimizer = torch.optim.AdamW([
        {'params': model.decoder.parameters(), 'lr': 1e-07/2},
        {'params': model.encoder.parameters(), 'lr': 5e-07},
    ])
    scheduler = ReduceLROnPlateau(optimizer, factor=0.15, patience=2)
    print('#'*35); print('############ FOLD ', fold+1, ' #############'); print('#'*35);
    # The samplers draw only this fold's indices from each dataset.
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               num_workers=1,
                                               sampler=SubsetRandomSampler(tr_idx))
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=batch_size,
                                             num_workers=1,
                                             sampler=SubsetRandomSampler(val_idx))
因此,当您编写 DataLoader 部分时,请使用 SubsetRandomSampler。这样,DataLoader 中的采样器将始终随机采样 KFold 函数生成的 train/valid 索引。
通过结合 sklearn 的 KFold 和 torch.utils.data.Subset,这可以轻松实现。
# Cross-validation by combining sklearn's KFold with torch.utils.data.Subset:
# each iteration yields disjoint train/validation index arrays over the same
# underlying dataset.
kf = KFold(n_splits=params.training.k_folds, shuffle=True, random_state=42)
for fold_idx, (train_index, valid_index) in enumerate(kf.split(train_set_)):
    # ---- split the parent dataset into this fold's subsets --------------
    train_set = Subset(train_set_, train_index)
    valid_set = Subset(train_set_, valid_index)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True)
    # ---- rest of the fold's training/validation uses these loaders ------