I'm new to machine learning and working on a classification task. In my experiments, however, I've found that the loss on the training set keeps decreasing while the loss on the test set gradually increases. I'm confused; why is this happening?
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
from bert_vector import BertVector
from word2vec_vector import LoadData
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate=0.1):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)  # raw logits; CrossEntropyLoss applies log-softmax internally
        return x
class MLPTrain:
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim,
                 lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1, test_data=None, train_data=None):
        self.train_data = train_data
        self.test_data = test_data
        self.input_dim = input_dim
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.output_dim = output_dim
        self.mlp = MLP(input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim, dropout_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr
        self.epochs = epochs
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    def train(self):
        self.mlp.to(self.device)
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=self.lr)
        train_loss_list = []
        test_loss_list = []
        train_acc_list = []
        test_acc_list = []
        for epoch in range(self.epochs):
            self.mlp.train()
            epoch_loss = 0.0
            correct_train = 0
            total_train = 0
            # Training loop
            for i, (X, y) in enumerate(self.train_data):
                X = X.to(self.device)
                y = y.to(self.device)
                optimizer.zero_grad()
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_train += (preds == y).sum().item()
                total_train += y.size(0)
            avg_train_loss = epoch_loss / len(self.train_data)
            train_loss_list.append(avg_train_loss)
            train_accuracy = correct_train / total_train * 100
            train_acc_list.append(train_accuracy)
            # Evaluate on the test set
            test_loss, test_accuracy = self.evaluate()
            # Save test loss and accuracy
            test_loss_list.append(test_loss)
            test_acc_list.append(test_accuracy)
            # Print results every epoch
            print(f"Epoch {epoch + 1}/{self.epochs} - Train Loss: {avg_train_loss:.4f}, "
                  f"Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
        # Plot the metrics
        self.plot_metrics(train_loss_list, test_loss_list, train_acc_list, test_acc_list)
    def evaluate(self):
        self.mlp.eval()
        with torch.no_grad():
            epoch_loss = 0.0
            correct_test = 0
            total_test = 0
            for X, y in self.test_data:
                X = X.to(self.device)
                y = y.to(self.device)
                y_pred = self.mlp(X)
                loss = self.criterion(y_pred, y)
                epoch_loss += loss.item()
                # Calculate accuracy
                _, preds = torch.max(y_pred, 1)
                correct_test += (preds == y).sum().item()
                total_test += y.size(0)
        avg_test_loss = epoch_loss / len(self.test_data)
        test_accuracy = correct_test / total_test * 100
        return avg_test_loss, test_accuracy
    def plot_metrics(self, train_loss_list, test_loss_list, train_acc_list, test_acc_list):
        # Plot loss and accuracy curves
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
        # Plot loss
        ax1.plot(range(1, self.epochs + 1), train_loss_list, label='Train Loss')
        ax1.plot(range(1, self.epochs + 1), test_loss_list, label='Test Loss')
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Loss')
        ax1.set_title('Loss Curve')
        ax1.legend()
        # Plot accuracy
        ax2.plot(range(1, self.epochs + 1), train_acc_list, label='Train Accuracy')
        ax2.plot(range(1, self.epochs + 1), test_acc_list, label='Test Accuracy')
        ax2.set_xlabel('Epochs')
        ax2.set_ylabel('Accuracy')
        ax2.set_title('Accuracy Curve')
        ax2.legend()
        plt.show()
def get_DataLoader(X, y, batch_size=128):
    X = torch.tensor(X, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X, y)
    data = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data
def label_encoder(y):
    labelEncoder = LabelEncoder()
    y_encoded = labelEncoder.fit_transform(y)
    class_to_code = dict(zip(labelEncoder.classes_, range(len(labelEncoder.classes_))))
    return y_encoded, class_to_code
def start_train(x, y, input_dim, output_dim, lr=0.001, epochs=30, batch_size=128, dropout_rate=0.1):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    train_data = get_DataLoader(X_train, y_train, batch_size=batch_size)
    test_data = get_DataLoader(X_test, y_test, batch_size=batch_size)
    trainer = MLPTrain(
        train_data=train_data,
        test_data=test_data,
        input_dim=input_dim,
        hidden_dim1=256,
        hidden_dim2=128,
        hidden_dim3=32,
        output_dim=output_dim,
        lr=lr,
        epochs=epochs,
        batch_size=batch_size,
        dropout_rate=dropout_rate
    )
    trainer.train()
def main(llm, lr=0.001, batch_size=128, dropout_rate=0.3, epochs=50):
    if llm == "bert":
        bert = BertVector()
        X1, X2, y = bert.get_bert_vector()
        input_dim = X1.shape[1]
    else:
        word2vec = LoadData(model_type="Skip-Gram-model")
        X1, X2, y = word2vec.get_post_vector()
    X = np.concatenate((X1, X2), axis=1)
    # X = X1 + X2
    X = X2
    input_dim = X.shape[1]
    print("input dim: ", input_dim)
    y_encoded, class_to_code = label_encoder(y)
    output_dim = len(set(y_encoded))
    print("label class num: ", output_dim)
    # Train on the same feature matrix X that input_dim was computed from
    start_train(X, y_encoded, input_dim, output_dim, lr, epochs, batch_size, dropout_rate)
    print(class_to_code)
if __name__ == '__main__':
    llm = "word2vec"
    main(llm=llm, lr=0.001, batch_size=128, dropout_rate=0.1, epochs=100)
Above is my code. Can you see anything wrong with it?
This is usually called "overfitting": your model has started to learn ("memorize") the specifics of the training set, specializing on it at the cost of generalizing to the test set. There are many ways to mitigate overfitting; I'd suggest reading about the phenomenon in more depth so the underlying concepts become clearer. A few methods that come to mind:
- Use dropout, or more aggressive dropout (you appear to be using only 10%), to keep the model from relying too heavily on any one part of its architecture; see the sketch below.
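
For instance, with the MLPTrain class from your question, stronger dropout is a one-parameter change. This is a minimal sketch reusing your own classes and data loaders; the 0.5 rate is only an illustrative starting point, not a tuned recommendation:

# Hypothetical retrain with more aggressive dropout; 0.5 is a common
# starting point for an MLP that is clearly overfitting. Tune it by
# watching where the test-loss curve starts to climb.
trainer = MLPTrain(
    train_data=train_data,
    test_data=test_data,
    input_dim=input_dim,
    hidden_dim1=256,
    hidden_dim2=128,
    hidden_dim3=32,
    output_dim=output_dim,
    lr=0.001,
    epochs=100,
    batch_size=128,
    dropout_rate=0.5,  # raised from 0.1
)
trainer.train()

Note that your evaluate() already calls self.mlp.eval(), so dropout is correctly disabled at test time; the higher rate only affects training.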