Why does the test-set loss gradually increase during training?


I'm new to machine learning and working on a classification task. In my experiments, I found that the loss on the training set keeps decreasing while the loss on the test set gradually increases. I'm confused — why does this happen?

[Figure: loss curves — training loss decreasing while test loss increases over epochs]

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from bert_vector import BertVector
from word2vec_vector import LoadData


class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate=0.1):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return x


class MLPTrain:
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim,
                 lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1, test_data=None, train_data=None):
        self.train_data = train_data
        self.test_data = test_data
        self.input_dim = input_dim
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.output_dim = output_dim
        self.mlp = MLP(input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def train(self):
        self.mlp.to(self.device)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=self.lr)

        train_loss_list = []
        test_loss_list = []
        train_acc_list = []
        test_acc_list = []

        for epoch in range(self.epochs):
            self.mlp.train()
            epoch_loss = 0.0
            correct_train = 0
            total_train = 0

            # Training Loop
            for i, (X, y) in enumerate(self.train_data):
                X = X.to(self.device)
                y = y.to(self.device)

                optimizer.zero_grad()
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_train += (preds == y).sum().item()
                total_train += y.size(0)

            avg_train_loss = epoch_loss / len(self.train_data)
            train_loss_list.append(avg_train_loss)
            train_accuracy = correct_train / total_train * 100
            train_acc_list.append(train_accuracy)

            # Evaluate on the test set
            test_loss, test_accuracy = self.evaluate()

            # Save test loss and accuracy
            test_loss_list.append(test_loss)
            test_acc_list.append(test_accuracy)

            # Print results every epoch
            print(f"Epoch {epoch + 1}/{self.epochs} - Train Loss: {avg_train_loss:.4f}, "
                  f"Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

        # Plot the metrics
        self.plot_metrics(train_loss_list, test_loss_list, train_acc_list, test_acc_list)

    def evaluate(self):
        self.mlp.eval()
        with torch.no_grad():
            epoch_loss = 0.0
            correct_test = 0
            total_test = 0
            for X, y in self.test_data:
                X = X.to(self.device)
                y = y.to(self.device)
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                epoch_loss += loss.item()

                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_test += (preds == y).sum().item()
                total_test += y.size(0)

            avg_test_loss = epoch_loss / len(self.test_data)
            test_accuracy = correct_test / total_test * 100

            return avg_test_loss, test_accuracy

    def plot_metrics(self, train_loss_list, test_loss_list, train_acc_list, test_acc_list):
        # Plot loss and accuracy curves
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Plot Loss
        ax1.plot(range(1, self.epochs + 1), train_loss_list, label='Train Loss')
        ax1.plot(range(1, self.epochs + 1), test_loss_list, label='Test Loss')
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Loss')
        ax1.set_title('Loss Curve')
        ax1.legend()

        # Plot Accuracy
        ax2.plot(range(1, self.epochs + 1), train_acc_list, label='Train Accuracy')
        ax2.plot(range(1, self.epochs + 1), test_acc_list, label='Test Accuracy')
        ax2.set_xlabel('Epochs')
        ax2.set_ylabel('Accuracy')
        ax2.set_title('Accuracy Curve')
        ax2.legend()

        plt.show()


def get_DataLoader(X, y, batch_size=128, shuffle=True):
    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def label_encoder(y):
    labelEncoder = LabelEncoder()
    y_encoded = labelEncoder.fit_transform(y)

    class_to_code = dict(zip(labelEncoder.classes_, range(len(labelEncoder.classes_))))

    return y_encoded, class_to_code


def start_train(x, y, input_dim, output_dim, lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    train_data = get_DataLoader(X_train, y_train, batch_size=batch_size, shuffle=True)
    # Keep the test loader unshuffled so evaluation runs are reproducible.
    test_data = get_DataLoader(X_test, y_test, batch_size=batch_size, shuffle=False)

    trainer = MLPTrain(
        train_data=train_data,
        test_data=test_data,
        input_dim=input_dim,
        hidden_dim1=256,
        hidden_dim2=128,
        hidden_dim3=32,
        output_dim=output_dim,
        lr=lr,
        epochs=epochs,
        batch_size=batch_size,
        dropout_rate=dropout_rate
    )
    trainer.train()


def main(llm, lr=0.001, batch_size=128, dropout_rate=0.3, epochs=50):
    if llm == "bert":
        bert = BertVector()
        X1, X2, y = bert.get_bert_vector()
    else:
        word2vec = LoadData(model_type="Skip-Gram-model")
        X1, X2, y = word2vec.get_post_vector()

    # Use only the second feature set; concatenating both is an alternative.
    # X = np.concatenate((X1, X2), axis=1)
    X = X2
    input_dim = X.shape[1]

    print("input dim: ", input_dim)
    y_encoded, class_to_code = label_encoder(y)
    output_dim = len(set(y_encoded))
    print("label class num: ", output_dim)

    start_train(X, y_encoded, input_dim, output_dim, lr, epochs, batch_size, dropout_rate)

    print(class_to_code)


if __name__ == '__main__':
    llm = "word2vec"
    main(llm=llm, lr=0.001, batch_size=128, dropout_rate=0.1, epochs=100)

That's my code above. Is there anything wrong with it?

machine-learning classification mlp
1 Answer

This is commonly known as "overfitting". Your model has started to learn ("memorize") the specifics of the training set, specializing on it at the expense of generalizing to the test set. There are many ways to combat overfitting; I suggest reading up on the phenomenon so the related concepts become clearer. Some approaches that come to mind:

• Use dropout, or more aggressive dropout (you appear to be using 10%), to keep the model from relying too heavily on any single part of the architecture.
• Size the model appropriately for the dataset. Oversized models memorize more easily than small ones.
• Add more training data. With enough training data, the model cannot memorize it effectively. If collecting data is expensive, consider data-augmentation techniques.
• Schedule training carefully so that it stops once the validation/test loss starts to increase (early stopping); see the sketches after this list.
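For early stopping, you should monitor a held-out validation split rather than the test set itself, otherwise you end up tuning against your test data. A minimal sketch of how the split in `start_train` could be extended — the 60/20/20 fractions are an illustrative assumption, not something from your code:

```python
from sklearn.model_selection import train_test_split

# Hold out the test set first, then carve a validation set out of what
# remains (0.25 of the remaining 80% = 20% of the total: a 60/20/20 split).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42)
```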
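And here is a minimal early-stopping sketch that could replace the fixed-epoch loop in `MLPTrain.train`. The function name, the `patience` parameter, and the `weight_decay` value (which adds L2 regularization through Adam) are my own illustrative choices, not tuned settings:

```python
import copy

import torch


def train_with_early_stopping(model, train_loader, val_loader, criterion,
                              lr=1e-3, weight_decay=1e-4, max_epochs=100,
                              patience=5, device="cpu"):
    """Train until the validation loss stops improving for `patience` epochs."""
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=weight_decay)

    best_val_loss = float("inf")
    best_state = copy.deepcopy(model.state_dict())
    epochs_without_improvement = 0

    for epoch in range(max_epochs):
        # One pass over the training data.
        model.train()
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            optimizer.step()

        # Measure validation loss with dropout disabled.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                val_loss += criterion(model(X), y).item()
        val_loss /= len(val_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = copy.deepcopy(model.state_dict())
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Stopping early at epoch {epoch + 1}")
                break

    # Restore the weights from the best validation epoch.
    model.load_state_dict(best_state)
    return model
```

Pass the validation loader (not the test loader) as `val_loader`, and evaluate on the test set only once, after training has stopped.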