我正在努力使用 PyTorch 创建一个机器学习项目,我意识到我的模型似乎无法学习 - 它总是输出一条几乎没有变化的平坦线,并且损失几乎没有减少。为了发现问题,我将程序缩减为另一个最小程序(有问题的程序),但问题仍然存在。
在这个简化的程序中,我的目标是建立一个接近 y=x^2 形状的模型。我从原来的较大项目中复制并粘贴了模型类
ANN
,该项目有 6 个功能。为了简单起见,我将前 5 个特征设置为 0,最后一个特征设置为实际的 x
(在 -2 到 2 的区间内均匀分隔的数字),并根据 x
通过我的 y
生成相应的
generate_targets()
值.
from torch import tensor, float32
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
class ANN(nn.Module):
def __init__(self, feature_num: int):
super(ANN,self).__init__()
self.layers = nn.Sequential(
nn.Linear(feature_num, 300),
nn.Tanh(),
nn.Linear(300, 200),
nn.Tanh(),
nn.Linear(200, 150),
nn.Tanh(),
nn.Linear(150, 50),
nn.Tanh(),
nn.Linear(50, 1)
)
def forward(self, x):
predictions = self.layers(x)
return predictions
class TestDataset(Dataset):
def __init__(self, sample_num):
self.sample_num = sample_num
self.func_max = 2
self.func_min = -2
self.unit = (self.func_max - self.func_min) / self.sample_num
self.targets = generate_targets(self.sample_num)
def __getitem__(self, index):
x = self.func_min + index * self.unit
return tensor([0, 0, 0, 0, 0, x], dtype=float32), self.targets[index]
def __len__(self):
return self.sample_num
# Generate the list of y
def generate_targets(count):
func_max = 2
func_min = -2
unit = (func_max - func_min)/count
target_list = []
for i in range(count):
x = func_min + unit*i
y = x ** 2
target_list.append(y)
return target_list
# The main program
def start_train():
sample_num = 500
train_data = TestDataset(sample_num)
train_dataloader = DataLoader(train_data, batch_size=10, shuffle=True)
model = ANN(6)
model.train()
mae_loss = nn.L1Loss()
optimizer = optim.Adam(model.parameters())
loss_list = []
for i in range(sample_num):
train_feature, train_target = next(iter(train_dataloader))
prediction = model(train_feature)
loss = mae_loss(prediction, train_target.float().unsqueeze(1))
loss.backward()
optimizer.step()
optimizer.zero_grad()
loss_list += [loss.item()]
print(f"iteration [{i + 1}/{sample_num}] Loss = {loss.item():.3f}")
plt.plot(range(sample_num), loss_list, marker='o', label='Validation')
plt.xlabel('iterations')
plt.ylabel('MAE loss')
plt.title('Loss vs Iteration')
plt.legend(loc='upper right')
plt.savefig('Debugger_Loss_Iter.png')
plt.close()
comparison(model, sample_num)
# Evaluate the trained model by plugging in each x coord and see the generated comparative graph
def comparison(model: ANN, sample_num: int) -> None:
model.eval()
prediction_list = []
for i in range(sample_num):
train_feature = tensor([0, 0, 0, 0, 0, i / sample_num], dtype=float32)
prediction = model(train_feature)
prediction_list += [prediction.item()]
target_list = generate_targets(sample_num)
x_list = [(i - sample_num / 2) / sample_num for i in list(range(sample_num))]
plt.plot(x_list, target_list, marker='o', label='Target')
plt.plot(x_list, prediction_list, marker='o', label='Prediction')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Prediction-target Comparison')
plt.legend(loc='upper right')
plt.savefig('Debugger_Comparison.png')
plt.close()
if __name__ == '__main__':
start_train()
执行后,输出类似于附图。 (在图像上,x标签的比例是错误的,但这不应该是学习问题的一部分)
我期望的正确输出是接近目标的形状,类似于附图。
我注意到通常张量不匹配是不学习的常见原因,我特意检查了输入和输出张量的形状和数据类型。不幸的是,我发现这些张量符合我的期望。
train:
train_feature: (10,6) float32
train_target.shape([10,1]) float32
prediction: ([10,1]) float32
comparison:
train_feature:([10,6]) float32
prediction([10,1]) float32
训练是正确的,但比较代码是错误的:您在范围(0,1)内均匀采样 x。尝试纠正并再次训练。
for i in range(sample_num):
train_feature = tensor([0, 0, 0, 0, 0, i / sample_num], dtype=float32)