我的梯度下降实现有什么问题（带有铰链损失的SVM分类器）

Question

我正在尝试在 jupyter 笔记本中使用 python 和 numpy 从头开始实现和训练 SVM 多类分类器。

我一直使用 CS231n 课程作为我的知识基础，尤其是这个页面：https://cs231n.github.io/optimization-1/，其中讨论了梯度下降。我已经实现了一个类，SVM，我相信它走在正确的轨道上。

以下是这个类（class）的基本概况：

class SVM:
  '''
  Multiclass linear SVM with hinge loss, trained by mini-batch gradient steps.

  NOTE(review): the code below is kept exactly as posted. Two things look
  wrong in it: loss() and gradient() assign the correct/incorrect scores to
  wj_xi/wyi_xi the opposite way round from what their own comments describe,
  and fit() adds the gradient step to the weights instead of subtracting it.
  '''
  def __init__(self):
    '''
    creates the weight matrix and an empty training history

    Reads the globals `labels` and `X_train`: the weights get one row per
    entry of `labels` and one column per column of `X_train`, and are filled
    with standard-normal noise scaled by 0.1.
    '''
    self.weights = np.random.randn(len(labels), X_train.shape[1]) * 0.1
    #one dict per fit() iteration, with keys 'epoch', 'loss' and 'accuracy'
    self.history = []

  def predict(self, X):
    '''
    returns the raw class scores (not class labels) in np array of size
    n x num_classes, where n is the number of examples in X
    '''

    #matrix multiplication to apply weights to X; result is num_classes x n
    bounds = self.weights @ X.T

    #transpose so that each row holds the scores of one example
    return np.array(bounds).T

  def loss(self, scores, y, delta=1):
    '''
    computes the average loss over the examples in scores

    scores: per-example class scores, indexed as scores[i][class]
    y: per-example labels, used directly as indices into scores[i]
    delta: margin added inside the hinge
    '''
    #calculate and return the loss for a prediction and corresponding truth label
    #hinge loss in this case
    total_loss = 0

    #compute loss for each example...
    for i in range(len(scores)):
      #extract values for this example
      scores_of_x = scores[i]
      label = y[i]
      correct_score = scores_of_x[label]
      #every score except the one at index `label`
      incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))

      #use the scores for example x to compute the loss at x
      #NOTE(review): the two assignments below are swapped relative to the intent
      #stated in their comments, so this computes correct - incorrect + delta
      #rather than the hinge term incorrect - correct + delta
      wj_xi = correct_score           #intended: the vector of INCORRECT scores (holds the correct score here)
      wyi_xi = incorrect_scores       #intended: the CORRECT score (holds the incorrect scores here)
      wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
      losses = np.maximum(0, wy_xi)   #lower bound the losses at 0
      loss = np.sum(losses)           #sum the losses

      #add to the total loss
      total_loss += loss

    #return the loss, averaged over the number of examples
    avg_loss = total_loss / len(scores)
    return avg_loss

  def gradient(self, scores, X, y, delta=1):
    '''
    computes the gradient of the average loss with respect to the weights

    scores: per-example class scores, indexed as scores[i][class]
    X: the examples, indexed as X[i]
    y: per-example labels, used directly as row indices into the gradient
    delta: margin added inside the hinge

    returns an array with the same shape as self.weights
    '''
    #gradient of hinge loss function, accumulated one example at a time
    gradient = np.zeros(self.weights.shape)

    #calculate the gradient in each example in x
    for i in range(len(X)):
      #extract values for this example
      scores_of_x = scores[i]
      label = y[i]
      x = X[i]
      correct_score = scores_of_x[label]
      #every score except the one at index `label`
      incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))

      #
      ##
      ### start by computing the gradient of the weights of the correct classifier
      ##
      #
      #NOTE(review): same swap as in loss() - the margins below are
      #correct - incorrect + delta, which changes which classes get counted
      wj_xi = correct_score           #intended: the vector of INCORRECT scores (holds the correct score here)
      wyi_xi = incorrect_scores       #intended: the CORRECT score (holds the incorrect scores here)
      wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
      losses = np.maximum(0, wy_xi)   #lower bound the losses at 0

      #get number of nonzero losses, and scale data vector by them to get the gradient
      num_contributing_classifiers = np.count_nonzero(losses)
      #print(f"Num loss contributors: {num_contributing_classifiers}")
      g = -1 * x * num_contributing_classifiers   #NOTE the -, very important here, doesn't apply to other scores

      #add the gradient of the correct classifier to the gradient
      gradient[label] += g  #label is used directly as the row index (no offset is applied)
      # print(f"correct label: {label}")
      #print(f"gradient:\n{gradient}")
      #
      ##
      ### then, compute the gradient of the weights for each incorrect classifier
      ##
      #
      for j in range(len(scores_of_x)):

        #skip the correct score, since we already did it
        if j == label:
          continue
        wj_xi = scores_of_x[j]          #score of the CURRENT (incorrect) classifier
        wyi_xi = correct_score          #score of the CORRECT classifier
        wy_xi = wj_xi - wyi_xi + delta  #core of the hinge loss formula
        loss = np.maximum(0, wy_xi)   #lower bound the loss at 0

        #get whether this classifier contributed to the loss, and scale the data vector by that to get the gradient
        contributed_to_loss = 0
        if loss > 0:
          contributed_to_loss = 1

        g = x * contributed_to_loss        #either times 1 or times 0

        #add the gradient of the incorrect classifier to the gradient
        gradient[j] += g


    #divide the gradient by number of examples to get the average gradient
    return gradient / len(X)

  def fit(self, X, y, epochs = 1000, batch_size = 256, lr=1e-2, verbose=True):
    '''
    runs `epochs` mini-batch update steps on the weights

    X, y: training examples and labels; both must support .iloc[...] and
      .to_numpy() (presumably a pandas DataFrame and Series)
    epochs: number of update steps; each step uses one random batch
    batch_size: number of examples drawn, without replacement, per step
    lr: learning rate the gradient is multiplied by
    verbose: when True, prints progress every 50 steps

    Appends one dict per step to self.history.
    '''
    #gradient descent loop
    for epoch in range(epochs):
      self.history.append({'epoch': epoch})

      #create a batch of samples to calculate the gradient
      #NOTE: this significantly boosts the speed of training
      #NOTE(review): sampling is without replacement, so batch_size > len(X) is expected to raise - confirm
      indices = np.random.choice(len(X), batch_size, replace=False)
      X_batch = X.iloc[indices]
      y_batch = y.iloc[indices]
      
      X_batch = X_batch.to_numpy()
      y_batch = y_batch.to_numpy()

      #evaluate class scores on the current batch; the predicted class is the highest-scoring one
      predictions = self.predict(X_batch)
      predicted_classes = np.argmax(predictions, axis=1)

      #compute the loss: average hinge loss
      loss = self.loss(predictions, y_batch)
      self.history[-1]['loss'] = loss

      #compute accuracy on the current training batch (not a test set), for an intuitive metric
      accuracy = np.mean(predicted_classes == y_batch)
      self.history[-1]['accuracy'] = accuracy
      
      #print progress
      if epoch%50 == 0 and verbose:
        print(f"Epoch: {epoch} | Loss: {loss} | Accuracy: {accuracy} | LR: {lr} \n")


      #compute the gradient on the scores assigned by the classifier
      gradient = self.gradient(predictions, X_batch, y_batch)
      
      #scale the gradient by the learning rate
      step = gradient * lr

      #NOTE(review): this ADDS the step, i.e. moves along the gradient;
      #descending the loss requires subtracting it
      self.weights += step

这是我的实现。 fit() 方法是训练传入数据的权重的方法。我正处于这样一个阶段：从一次迭代到下一次迭代，损失往往会减少。

但是，问题是，即使损失减少，准确率也会下降到零。

我知道它们没有直接关系，但随着损失的下降，我的准确率不应该总体上升吗？这让我觉得我在 loss() 和 gradient() 方法中做错了什么。但是，我似乎找不到错在哪里。而且，有时损失会从一个轮次（epoch）到下一个轮次反而增加。这可能是我按小批量（mini-batch）计算梯度造成的，但我不确定。

这是我的 Jupyter 笔记本的链接，它应该可以让您在当前状态下运行我的代码：

https://colab.research.google.com/drive/12z4DevKDicmT4iE6AlMGrRiN6He8R9_4#scrollTo=uBTUQlscWksP 这是我正在使用的数据集的链接：https://www.kaggle.com/datasets/taweilo/fish-species-sampling-weight-and-height-data/code

Answer 1

对于看到这个帖子的任何人：我已经解决了自己的问题。事实证明，我误读了公式，把其中两项的位置弄反了。我原来代码中的注释其实是正确的。变量 wj_xi 和 wyi_xi 实际上应该这样定义（在 gradient() 和 loss() 方法中）：

wj_xi = incorrect_scores     #these should be a vector of INCORRECT scores
wyi_xi = correct_score       #this should be a vector of the CORRECT score

我之前把它们写反了。另外，正如回复中提到的，必须沿梯度的负方向更新权重，如下所示：

#执行一次参数更新，在梯度的负方向上

self.weights -= step

完整代码：

class SVM:
  '''
  Multiclass linear SVM trained with mini-batch gradient descent on the
  average multiclass hinge loss.

  Attributes:
    weights: array of shape (num_classes, num_features), one row of weights
      per class. There is no separate bias vector; a bias has to be supplied
      as a constant column of the feature matrix.
    history: list with one dict per training step, holding the keys
      'epoch', 'loss' and 'accuracy'.
  '''

  def __init__(self, num_classes=None, num_features=None):
    '''
    initialises the weights with small Gaussian noise

    num_classes: number of classes. Labels are used directly as row indices
      into the weights, so they must be integers in [0, num_classes).
      Defaults to len(labels), where `labels` is a notebook-level global.
    num_features: number of columns of the feature matrix. Defaults to
      X_train.shape[1], where `X_train` is a notebook-level global.
    '''
    #fall back to the notebook globals so that a bare SVM() keeps working
    if num_classes is None:
      num_classes = len(labels)
    if num_features is None:
      num_features = X_train.shape[1]

    self.weights = np.random.randn(num_classes, num_features) * 0.1
    self.history = []

  def predict(self, X):
    '''
    returns the raw class scores (not class labels) in np array of size
    n x num_classes, where n is the number of examples in X
    '''
    #weights @ X.T is num_classes x n; transpose to get one row per example
    return (self.weights @ np.asarray(X).T).T

  def loss(self, scores, y, delta=1):
    '''
    returns the average hinge loss of the batch

    For each example the loss is sum_j max(0, s_j - s_y + delta) over all
    classes j other than the true class y.

    scores: array of shape (n, num_classes), as returned by predict()
    y: n integer labels, each a valid column index into scores
    delta: required margin between the correct score and every other score

    raises ValueError if the batch is empty
    '''
    scores = np.asarray(scores)
    y = np.asarray(y)
    num_examples = len(scores)
    if num_examples == 0:
      raise ValueError("cannot compute the loss of an empty batch")

    rows = np.arange(num_examples)
    correct_scores = scores[rows, y]

    #hinge term for every (example, class) pair, lower bounded at 0
    margins = np.maximum(0, scores - correct_scores[:, np.newaxis] + delta)
    #the true class is not part of the sum (its entry would always be delta)
    margins[rows, y] = 0

    return np.sum(margins) / num_examples

  def gradient(self, scores, X, y, delta=1):
    '''
    returns the gradient of the average hinge loss w.r.t. the weights

    For one example x with true class y, every class j != y whose margin
    s_j - s_y + delta is positive receives +x on its weight row, and the row
    of the true class receives -x once per such class.

    scores: array of shape (n, num_classes), as returned by predict()
    X: array of shape (n, num_features), the examples that produced scores
    y: n integer labels, each a valid column index into scores
    delta: required margin between the correct score and every other score

    returns an array of shape (num_classes, num_features)
    raises ValueError if the batch is empty
    '''
    scores = np.asarray(scores)
    X = np.asarray(X)
    y = np.asarray(y)
    num_examples = len(X)
    if num_examples == 0:
      raise ValueError("cannot compute the gradient of an empty batch")

    rows = np.arange(num_examples)
    correct_scores = scores[rows, y]

    #coefficient of x in the gradient of each class row: 1 where the margin is violated
    coefficients = (scores - correct_scores[:, np.newaxis] + delta > 0).astype(float)
    #the true class is not compared against itself...
    coefficients[rows, y] = 0
    #...instead it gets -1 for every other class that violates the margin
    coefficients[rows, y] = -coefficients.sum(axis=1)

    #sum coefficient * x over the examples, then average
    return (coefficients.T @ X) / num_examples

  def fit(self, X, y, epochs = 1000, batch_size = 256, lr=1e-2, verbose=True):
    '''
    trains the model on the training set

    X: (n, num_features) examples; a pandas DataFrame or any array-like
    y: n integer labels; a pandas Series or any array-like
    epochs: number of update steps; each step uses one random mini-batch
    batch_size: examples drawn, without replacement, per step; capped at n
    lr: learning rate
    verbose: when True, prints progress every 50 steps

    Appends one {'epoch', 'loss', 'accuracy'} dict per step to self.history.
    Loss and accuracy are measured on the step's batch before the update.

    raises ValueError if X is empty or X and y differ in length
    '''
    #convert once up front; rows are then selected by position, like .iloc
    X = np.asarray(X)
    y = np.asarray(y)
    num_examples = len(X)
    if num_examples == 0:
      raise ValueError("cannot fit on an empty training set")
    if len(y) != num_examples:
      raise ValueError("X and y must contain the same number of examples")

    #sampling without replacement cannot draw more rows than there are
    batch_size = min(batch_size, num_examples)

    #gradient descent loop
    for epoch in range(epochs):
      #draw the mini-batch for this step
      indices = np.random.choice(num_examples, batch_size, replace=False)
      X_batch = X[indices]
      y_batch = y[indices]

      #evaluate class scores on the batch; the prediction is the top-scoring class
      predictions = self.predict(X_batch)
      predicted_classes = np.argmax(predictions, axis=1)

      #average hinge loss and accuracy on this training batch
      loss = self.loss(predictions, y_batch)
      accuracy = np.mean(predicted_classes == y_batch)
      self.history.append({'epoch': epoch, 'loss': loss, 'accuracy': accuracy})

      if epoch%50 == 0 and verbose:
        print(f"pred: {predicted_classes[:10]}")
        print(f"true: {y_batch[:10]}")
        print(f"Epoch: {epoch} | Loss: {loss} | Accuracy: {accuracy} | LR: {lr} \n")

      #perform a parameter update, in the negative direction of the gradient
      gradient = self.gradient(predictions, X_batch, y_batch)
      self.weights -= gradient * lr

#build the classifier; called without arguments, SVM() sizes its weights from the globals `labels` and `X_train`
sm = SVM()
#score the slice X_train[0:1] (presumably the first training row) with the still-untrained weights
pred = sm.predict(np.array(X_train[0:1]))

#train on the training data with the default epochs, batch size and learning rate
sm.fit(X_train, y_train)

我的梯度下降实现有什么问题（带有铰链损失的SVM分类器）

问题描述投票：0回答：1

1个回答

最新问题

我的梯度下降实现有什么问题（带有铰链损失的SVM分类器）

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1