我在使用 PyTorch 构建一个多类分类模型,根据数据集中的 2 个输入(案发时间、区域)对犯罪类型进行分类。数据处理如下:
# main data: load the raw CSV into a DataFrame
df = pd.read_csv('abstract_data.csv', delimiter=',')

# extracting the 2 input feature columns from the dataset
inputs_list = ['TIME OCC', 'AREA']
inputs = df.loc[:, inputs_list].to_numpy()  # shape: (893672, 2)

# extracting the labels (the crime-code column)
y = df['Crm Cd']
标签列 y 看起来像这样:
0 510
1 330
2 480
3 343
4 354
...
893667 740
893668 230
893669 625
893670 745
893671 210
Name: Crm Cd, Length: 893672, dtype: int64
这里的类别标签由 3 位整数组成,共有 138 个不同的类别。我用以下代码提取了它们:
# unique classes
# returned in order of first appearance; the position of each crime code
# in `unq` becomes that code's 0-based class id in the mapping step below
unq=y.unique()
据我所知,为了构建一个 138 类的分类模型,我需要把由 3 位数字组成的原始输出标签,映射为 0 到 137 范围内的连续类别编号。我用下面这段代码做到了:
# mapping original class numbers into contiguous class ids [0, len(unq))
# FIX: the original nested loop re-scanned `unq` for every one of the
# ~893k rows (O(rows * classes)); a dict gives the same first-appearance
# index in O(1) per row.
class_to_id = {cls: idx for idx, cls in enumerate(unq)}
output = np.array([class_to_id[label] for label in y])
因此现在类别编号都落在 0 到 137 的范围内。
# min-max normalization of the input features to the [0, 1] range
inputs = (inputs - inputs.min(axis=0)) / (inputs.max(axis=0) - inputs.min(axis=0))
# converting to tensors
inputs_tensors = torch.tensor(data=inputs, dtype=torch.float32)
# FIX: build the label tensor as int64 directly; the original created a
# float32 tensor and then cast it to LongTensor — a needless round-trip
# that can silently corrupt integer labels too large for float32.
labels_tensors = torch.tensor(data=output, dtype=torch.long)
# train & test split: first 90% train, last 10% test
# NOTE(review): this split is not shuffled — if the CSV is ordered (e.g. by
# date) the test set may not be representative; consider a random split.
size_split = int(0.9 * len(inputs_tensors))
train_inputs, train_labels = inputs_tensors[:size_split], labels_tensors[:size_split]
test_inputs, test_labels = inputs_tensors[size_split:], labels_tensors[size_split:]
# creating a custom data set
# BUG FIX: the original read `lass CustomDataset(Dataset):` — a typo for
# `class` that is a SyntaxError and prevents the script from running at all.
class CustomDataset(Dataset):
    """Wraps paired input/label tensors for use with a DataLoader."""

    def __init__(self, inputs, labels):
        # inputs and labels are expected to have equal length (one label
        # per sample); indexed in lockstep by __getitem__
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        # number of samples in the dataset
        return len(self.inputs)

    def __getitem__(self, idx):
        # return one (input, label) pair
        input_data = self.inputs[idx]
        label = self.labels[idx]
        return input_data, label
# wrap the split tensors in datasets
train_dataset = CustomDataset(train_inputs, train_labels)
test_dataset = CustomDataset(test_inputs, test_labels)

# dataloaders: shuffle the training set each epoch, keep test order fixed
BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# device setting: prefer the GPU when one is available
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# main training loop
def train_test_loop(
        epochs: int,
        model: torch.nn.Module,
        loss_fn: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        accuracy_fn,
        train_dataloader: torch.utils.data.DataLoader,
        test_dataloader: torch.utils.data.DataLoader,
        device: torch.device = device):
    """Train `model` on `train_dataloader` for `epochs` epochs.

    Prints per-batch loss every 400 batches and the mean loss/accuracy
    at the end of each epoch.

    Args:
        epochs: number of passes over the training data.
        model: the network to train (moved to `device` in place).
        loss_fn: criterion, e.g. nn.CrossEntropyLoss (expects raw logits).
        optimizer: optimizer wrapping `model.parameters()`.
        accuracy_fn: callable(y_true, y_pred) -> accuracy percentage.
        train_dataloader: batches of (X, y) used for optimization.
        test_dataloader: accepted for interface compatibility; not used here.
        device: compute device; defaults to the module-level `device`.

    Returns:
        list[float]: per-batch training-loss history.
    """
    from timeit import default_timer as timer

    loss_hist = []
    epoch_hist = []
    train_time_start = timer()
    model.to(device)
    for epoch in range(epochs):
        print(f'epoch: {epoch}\n-----')
        model.train()  # set training mode once per epoch, not per batch
        train_loss = 0.0
        # BUG FIX: the original initialized train_acc once before the epoch
        # loop, so accuracy accumulated across epochs and the printed value
        # was wrong for every epoch after the first. Reset it each epoch.
        train_acc = 0.0
        for batch, (X, y) in enumerate(train_dataloader):
            X, y = X.to(device), y.to(device)
            y_pred = model(X)
            loss = loss_fn(y_pred, y)
            # BUG FIX: append/accumulate .item(), not the tensor — keeping
            # the raw loss tensor retains its whole autograd graph and
            # leaks memory over 25k+ batches per epoch.
            loss_hist.append(loss.item())
            train_loss += loss.item()
            train_acc += accuracy_fn(y_true=y, y_pred=y_pred.argmax(dim=1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_hist.append(batch)
            if batch % 400 == 0:
                print(f' batch: {batch} | loss: {loss} | ')
                print(f" Looked at {batch * len(X)}/{len(train_dataloader.dataset)} samples")
        train_loss /= len(train_dataloader)
        print(f'total_loss: {train_loss}')
        train_acc /= len(train_dataloader)
        print(f' acc: {train_acc} | ')
    # the timer was started but never reported in the original; surface it
    print(f'train time: {timer() - train_time_start:.2f}s on {device}')
    return loss_hist
# base model
class MLPClassifier(nn.Module):
    """MLP mapping 2 input features to logits over 138 crime classes.

    forward() returns *raw logits*, not probabilities: nn.CrossEntropyLoss
    applies log-softmax internally, so applying torch.softmax in the model
    as the original did double-normalizes the output, crushes the gradients,
    and makes the loss oscillate around log(138) ≈ 4.93 without converging —
    exactly the symptom reported. Use softmax only at inference time if
    probabilities are needed; argmax over logits is unchanged either way.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(2, 100)    # input layer
        self.linear2 = nn.Linear(100, 512)  # hidden layer
        self.linear3 = nn.Linear(512, 200)
        self.output = nn.Linear(200, 138)   # output layer with 138 classes

    def forward(self, x):
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))
        x = torch.relu(self.linear3(x))
        # BUG FIX: return logits directly — no torch.softmax here.
        return self.output(x)
# instantiate the base model
model = MLPClassifier()

# optimizer and loss: cross-entropy for multi-class classification,
# plain SGD with a small learning rate
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)
我也尝试了 Adam 优化器,但结果是一样的。
# model training
def accuracy_fn(y_true, y_pred):
    """Calculates accuracy between truth labels and predictions.

    Args:
        y_true (torch.Tensor): Truth labels for predictions.
        y_pred (torch.Tensor): Predictions to be compared to the truth labels.

    Returns:
        float: Accuracy percentage between y_true and y_pred, e.g. 78.45
    """
    # element-wise equality, summed to a Python int
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct / len(y_pred)) * 100
    return acc
    # FIX: dropped the original's redundant no-op `accuracy_fn=accuracy_fn`
    # self-assignment that followed this definition.
# kick off training; t1 captures whatever train_test_loop returns
t1 = train_test_loop(
    epochs=10,
    model=model,
    loss_fn=loss,
    accuracy_fn=accuracy_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
)
运行后,损失一直在 4.908379554748535 和 4.708379554748535 之间振荡,始终不收敛。我尝试了各种办法:更改模型架构、层数和参数,更换优化器和学习率,但都没有用,我真的快没希望了。我原本期待损失会收敛,但它只是不停振荡。我怀疑是数据集的格式有问题,但我不确定。
我没有现成的解决方案,但我的建议是从小而简单的模型入手,再逐步增加复杂度——设计机器学习模型时这通常是个好主意。因此,可以先从数据中只选取两个类别,尝试构建一个能区分它们的模型;如果进展顺利,再逐步加入更多类别,观察效果。同时从一个非常简单的模型开始,例如只有三层且每层都很小。另外要注意:你只有两个特征,却有 138 个类别,这本身可能就是个问题——你确定这些特征和类别之间存在可以学习的关系吗?了解每个类别各有多少样本也很有意义:各类别的样本量是否均衡,还是有些类别数据量很大,而有些类别只有几个样本?