I have a training function like this:
def training():
    model.train()

    train_mae = []

    progress = tqdm(train_dataloader, desc='Training')

    for batch_index, batch in enumerate(progress):
        x = batch['x'].to(device)
        x_lengths = batch['x_lengths'].to(device)
        y = batch['y'].to(device)
        y_type = batch['y_type'].to(device)
        y_valid_indices = batch['y_valid_indices'].to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        y_first, y_second = model(x)

        losses = []

        for j in range(len(x_lengths)):
            x_length = x_lengths[j].item()

            if y_type[j].item() == 0:
                predicted = y_first[j]
            else:
                predicted = y_second[j]

            actual = y[j]

            valid_mask = torch.zeros_like(predicted, dtype=torch.bool)
            valid_mask[:x_length] = 1

            # Padding of -1 is removed from y
            indices_mask = y[j].ne(-1)
            valid_indices = y[j][indices_mask]

            valid_predicted = predicted[valid_mask]
            valid_actual = actual[valid_mask]

            loss = mae_fn(valid_predicted, valid_actual, valid_indices)
            losses.append(loss)

        # Backward pass and update
        loss = torch.stack(losses).mean()  # This fails due to different shapes
        loss.backward()
        optimizer.step()

        train_mae.append(loss.detach().cpu().numpy())

        progress.set_description(
            f"mae: {loss.detach().cpu().numpy():.4f}"
        )

    # Return the average MAEs for y type
    return np.mean(train_mae)
def mae_fn(output, target, indices):
    clipped_target = torch.clip(target, min=0, max=1)
    maes = F.l1_loss(output, clipped_target, reduction='none')
    return maes[indices]
Obviously these losses cannot be stacked, because the indexing gives them different shapes. Taking the mean of maes[indices] instead makes the code run, but it leads to a very poor test loss. Since the indices give each sample's loss a different shape depending on y_type, how should I compute the loss here?
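For illustration, here is a minimal reproduction of the shape mismatch with made-up lengths (the real lengths come from the masking above):

import torch

# Two per-sample loss tensors of different lengths cannot be stacked
losses = [torch.randn(5), torch.randn(3)]

try:
    torch.stack(losses).mean()
except RuntimeError as err:
    print(err)  # stack expects each tensor to be equal size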
Could you take the mean per batch first and then combine those means weighted by each batch's size? That way larger batches contribute more to the final mean than smaller ones. This should be more stable than a plain mean of means over all batches. Example below.
import torch

# Test data: per-sample loss tensors of different lengths
losses_perbatch = [torch.randn(8, 1), torch.randn(4, 1), torch.randn(2, 1)]

# Weighted mean: each batch's mean is weighted by its number of elements.
# torch.stack (rather than torch.tensor) keeps the computation graph intact,
# so the result can still be backpropagated through.
total_samples = sum(len(batch) for batch in losses_perbatch)
weighted_mean_perbatch = torch.stack(
    [batch.mean() * len(batch) for batch in losses_perbatch]
) / total_samples
final_weighted_loss = weighted_mean_perbatch.sum()
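Applied to the training loop in the question, a minimal sketch (reusing the losses list built by the inner loop and keeping everything else unchanged) could look like this:

# Sketch only: weight each sample's mean loss by its number of valid elements,
# then normalise by the total number of valid elements in the batch.
total_elements = sum(sample_loss.numel() for sample_loss in losses)
loss = torch.stack(
    [sample_loss.mean() * sample_loss.numel() for sample_loss in losses]
).sum() / total_elements

loss.backward()
optimizer.step()

Note that this weighting is mathematically the same as a flat mean over every valid element, i.e. torch.cat([l.flatten() for l in losses]).mean(), which may be the simplest way to write it.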