I'm trying to implement the gradient descent algorithm in Python for a machine learning problem. The dataset I'm working with has already been preprocessed, and I'm comparing two versions of it: one with the original features, and one whose dimensionality I reduced from the first using singular value decomposition (SVD). When I run gradient descent on both, I consistently observe that it finishes faster on the larger, original dataset than on the reduced one, which is the opposite of what I expected. Given that the reduced dataset is smaller, shouldn't its runtime be shorter? I'm trying to understand why this happens. Here is the relevant code snippet:
import time
import numpy as np
import matplotlib.pyplot as plt

def h_theta(X1, theta1):
    # Hypothesis function: linear model X @ theta
    return np.dot(X1, theta1)

def j_theta(X1, y1, theta1):
    # Cost function: mean squared error, averaged over the m samples
    return np.sum((h_theta(X1, theta1) - y1) ** 2) / (2 * len(y1))

def grad(X1, y1, theta):
    # Gradient of the cost with respect to theta
    gradient = np.dot(X1.T, h_theta(X1, theta) - y1) / len(y1)
    return gradient

def gradient_descent(X1, y1):
    theta_initial = np.zeros(X1.shape[1])  # Initialize theta with zeros
    num_iterations = 1000
    learning_rates = [0.1, 0.01, 0.001]
    cost_iterations = []
    theta_values = []
    start = time.time()
    for alpha in learning_rates:
        theta = theta_initial.copy()
        cost_history = []
        for i in range(num_iterations):
            gradient = grad(X1, y1, theta)
            theta = theta - alpha * gradient
            cost = j_theta(X1, y1, theta)
            cost_history.append(cost)
        cost_iterations.append(cost_history)
        theta_values.append(theta)
    end = time.time()
    print(f"Time taken: {end - start} seconds")
    fig, axs = plt.subplots(len(learning_rates), figsize=(8, 15))
    for i, alpha in enumerate(learning_rates):
        axs[i].plot(range(num_iterations), cost_iterations[i], label=f'alpha = {alpha}')
        axs[i].set_title(f'Learning Rate: {alpha}')
        axs[i].set_ylabel('Cost J')
        axs[i].set_xlabel('Number of Iterations')
        axs[i].legend()
    plt.tight_layout()
    plt.show()
# Code to reduce X to 3 features (columns) using SVD:
# Perform singular value decomposition on X and keep the top 3 right singular vectors
U, S, Vt = np.linalg.svd(X_normalized)
# Project X onto the top 3 components
X_reduced = np.dot(X_normalized, Vt[:3].T)
# Print the first 5 rows of X_reduced
print("First 5 rows of X_reduced:")
print(X_reduced[:5])
# Normalize X_reduced
X_reduced = (X_reduced - np.mean(X_reduced, axis=0)) / np.std(X_reduced, axis=0)
print("The means and stds of X after being reduced and normalized:\n", X_reduced.mean(axis=0), X_reduced.std(axis=0))
# Print the shape of the reduced X to confirm it has only 3 features
print("Shape of X_reduced:", X_reduced.shape)
# Add the intercept column to X_reduced
X_reduced_with_intercept = np.hstack((intercept_column, X_reduced))

# Example usage
# X_normalized_with_intercept and y_normalized represent the original dataset
# X_reduced_with_intercept and y_normalized represent the reduced dataset
# Perform gradient descent on the original dataset
gradient_descent(X_normalized_with_intercept, y_normalized)
# Perform gradient descent on the reduced dataset
gradient_descent(X_reduced_with_intercept, y_normalized)
In my gradient descent implementation, what could cause the reduced dataset to consistently take longer to run than the full dataset? Any insight or troubleshooting suggestions would be greatly appreciated.
I've tried rewriting and reviewing my implementation, but for most learning rates, and as the number of iterations increases, the larger feature set still runs faster than its SVD-reduced subset.
The runtime of your code appears to be dominated by bookkeeping and loop operations rather than by the actual gradient descent. I modified your code to avoid the outer for loop, and to allocate a numpy array and write into it instead of relying on slower lists:
import time
import numpy as np
import matplotlib.pyplot as plt

def h_theta(X1, theta1):
    # Hypothesis function: linear model X @ theta; theta has one column per learning rate
    return np.dot(X1, theta1)

def j_theta(X1, y1, theta1):
    # Cost per learning rate: sum squared errors over the samples (axis 0),
    # giving one cost value per column (i.e. per alpha)
    return np.sum((h_theta(X1, theta1) - y1) ** 2, axis=0) / (2 * len(y1))

def grad(X1, y1, theta):
    # Gradient for all learning rates at once (one column per alpha)
    h = h_theta(X1, theta)
    gradient = np.dot(X1.T, h - y1) / y1.shape[0]
    return gradient

def gradient_descent(X1, y1):
    learning_rates = [0.1, 0.01, 0.001]
    num_iterations = 1000
    num_learning_rates = len(learning_rates)
    learning_rates = np.array(learning_rates).reshape(1, -1)
    # One column of theta per learning rate, so all alphas are updated in one shot
    theta_initial = np.zeros((num_learning_rates, X1.shape[1])).T  # Initialize theta with zeros
    cost_iterations = np.zeros((num_learning_rates, num_iterations))
    theta = theta_initial.copy()
    start = time.time()
    for i in range(num_iterations):
        gradient = grad(X1, y1, theta)
        theta = theta - learning_rates * gradient  # broadcasts over the alpha columns
        cost = j_theta(X1, y1, theta)
        cost_iterations[:, i] = cost
    end = time.time()
    print(f"Time taken: {end - start} seconds")
    # Plotting disabled while timing:
    # fig, axs = plt.subplots(num_learning_rates, figsize=(8, 15))
    # for i, alpha in enumerate(learning_rates.ravel()):
    #     axs[i].plot(range(num_iterations), cost_iterations[i], label=f'alpha = {alpha}')
    #     axs[i].set_title(f'Learning Rate: {alpha}')
    #     axs[i].set_ylabel('Cost J')
    #     axs[i].set_xlabel('Number of Iterations')
    #     axs[i].legend()
    # plt.tight_layout()
    # plt.show()

# Stand-in data so the code runs end to end
X_normalized = np.random.randn(3072, 9)
# One target column per learning rate, so y broadcasts against theta's columns
y_normalized = np.random.randn(3072, 1).repeat(3, axis=1)
intercept_column = np.random.randn(3072, 1)

# Code to reduce X to 3 features (columns) using SVD:
# Perform singular value decomposition on X and keep the top 3 right singular vectors
U, S, Vt = np.linalg.svd(X_normalized)
# Project X onto the top 3 components
X_reduced = np.dot(X_normalized, Vt[:3].T)
# Print the first 5 rows of X_reduced
print("First 5 rows of X_reduced:")
print(X_reduced[:5])
# Normalize X_reduced
X_reduced = (X_reduced - np.mean(X_reduced, axis=0)) / np.std(X_reduced, axis=0)
print("The means and stds of X after being reduced and normalized:\n", X_reduced.mean(axis=0), X_reduced.std(axis=0))
# Print the shape of the reduced X to confirm it has only 3 features
print("Shape of X_reduced:", X_reduced.shape)
# Add the intercept column to both feature matrices
X_reduced_with_intercept = np.hstack((intercept_column, X_reduced))
X_normalized_with_intercept = np.hstack((intercept_column, X_normalized))

# Example usage
# X_normalized_with_intercept and y_normalized represent the original dataset
# X_reduced_with_intercept and y_normalized represent the reduced dataset
# Perform gradient descent on the original dataset
gradient_descent(X_normalized_with_intercept, y_normalized)
# Perform gradient descent on the reduced dataset
gradient_descent(X_reduced_with_intercept, y_normalized)
Now the code runs faster on the reduced dataset. I have filled in the blanks you left unspecified so that I could run your code. Hopefully I haven't broken the gradient descent code too badly, since I relied on the dimensions making sense rather than working the problem out with pen and paper.
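To make the overhead claim concrete, here is a minimal micro-benchmark sketch (my own addition; the sizes 3072x10 and 3072x4 are assumptions matching the stand-in data above, i.e. original features plus intercept versus 3 SVD components plus intercept). It times nothing but the matrix product from the gradient step at both widths:

import time
import numpy as np

m, iters = 3072, 1000                # assumed sizes, matching the example above
X_full = np.random.randn(m, 10)      # original features plus intercept column
X_small = np.random.randn(m, 4)      # 3 SVD components plus intercept column
th_full = np.random.randn(10, 3)     # one theta column per learning rate
th_small = np.random.randn(4, 3)

def time_matmul(X, theta):
    # Time only the dominant linear-algebra call from the gradient step
    start = time.time()
    for _ in range(iters):
        np.dot(X, theta)
    return time.time() - start

print(f"full ({X_full.shape[1]} cols):    {time_matmul(X_full, th_full):.4f} s")
print(f"reduced ({X_small.shape[1]} cols): {time_matmul(X_small, th_small):.4f} s")

If the two timings come out nearly identical, the fixed per-call cost of each np.dot (dispatch, argument checking, output allocation) is swamping the floating-point work at these sizes, and cutting the column count from 10 to 4 cannot make the overall loop measurably faster.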