I'm trying to implement the gradient descent algorithm in Python for a machine learning problem. The dataset I'm working with has already been preprocessed, and I'm comparing two versions of it: one with the original features, and one whose dimensionality I reduced from the first using singular value decomposition (SVD). When I run gradient descent on both, I consistently observe that it finishes faster on the larger, original dataset than on the reduced one, which is the opposite of what I expected. Given that the reduced dataset is smaller, shouldn't its runtime be shorter? I'm trying to understand why this happens. Here is the relevant code snippet:
import time
import numpy as np
import matplotlib.pyplot as plt

def h_theta(X1, theta1):
    # Hypothesis function: linear model X @ theta
    return np.dot(X1, theta1)

def j_theta(X1, y1, theta1):
    # Cost function: mean squared error, averaged over the m samples
    return np.sum((h_theta(X1, theta1) - y1) ** 2) / (2 * len(y1))

def grad(X1, y1, theta):
    # Gradient of the cost with respect to theta
    gradient = np.dot(X1.T, h_theta(X1, theta) - y1) / len(y1)
    return gradient

def gradient_descent(X1, y1):
    theta_initial = np.zeros(X1.shape[1])  # Initialize theta with zeros
    num_iterations = 1000
    learning_rates = [0.1, 0.01, 0.001]
    cost_iterations = []
    theta_values = []
    start = time.time()
    for alpha in learning_rates:
        theta = theta_initial.copy()
        cost_history = []
        for i in range(num_iterations):
            gradient = grad(X1, y1, theta)
            theta = theta - alpha * gradient
            cost = j_theta(X1, y1, theta)
            cost_history.append(cost)
        cost_iterations.append(cost_history)
        theta_values.append(theta)
    end = time.time()
    print(f"Time taken: {end - start} seconds")
    fig, axs = plt.subplots(len(learning_rates), figsize=(8, 15))
    for i, alpha in enumerate(learning_rates):
        axs[i].plot(range(num_iterations), cost_iterations[i], label=f'alpha = {alpha}')
        axs[i].set_title(f'Learning Rate: {alpha}')
        axs[i].set_ylabel('Cost J')
        axs[i].set_xlabel('Number of Iterations')
        axs[i].legend()
    plt.tight_layout()
    plt.show()
# Code to reduce X to 3 features (columns) using SVD:
# Perform singular value decomposition on X and keep the top 3 right singular vectors
U, S, Vt = np.linalg.svd(X_normalized)
# Project X onto the top 3 components
X_reduced = np.dot(X_normalized, Vt[:3].T)
# Print the first 5 rows of X_reduced
print("First 5 rows of X_reduced:")
print(X_reduced[:5])
# Normalize X_reduced
X_reduced = (X_reduced - np.mean(X_reduced, axis=0)) / np.std(X_reduced, axis=0)
print("The means and stds of X after being reduced and normalized:\n", X_reduced.mean(axis=0), X_reduced.std(axis=0))
# Print the shape of the reduced X to confirm it has only 3 features
print("Shape of X_reduced:", X_reduced.shape)
# Add the intercept column to X_reduced
X_reduced_with_intercept = np.hstack((intercept_column, X_reduced))

# Example usage
# X_normalized_with_intercept and y_normalized represent the original dataset
# X_reduced_with_intercept and y_normalized represent the reduced dataset
# Perform gradient descent on the original dataset
gradient_descent(X_normalized_with_intercept, y_normalized)
# Perform gradient descent on the reduced dataset
gradient_descent(X_reduced_with_intercept, y_normalized)
In my gradient descent implementation, what could cause the reduced dataset to consistently take longer to run than the full dataset? Any insight or troubleshooting suggestions would be greatly appreciated.
I've tried rewriting and reviewing my implementation, but for most learning rates, and as the number of iterations increases, the larger feature set still runs faster than its SVD-reduced subset.
The runtime of your code appears to be dominated by bookkeeping and loop operations rather than by the actual gradient descent. I modified your code to avoid the outer for loop, and to allocate a numpy array and write into it instead of relying on slower lists:
import time
import numpy as np
import matplotlib.pyplot as plt

def h_theta(X1, theta1):
    # Hypothesis function: linear model X @ theta; theta has one column per learning rate
    return np.dot(X1, theta1)

def j_theta(X1, y1, theta1):
    # Cost per learning rate: sum squared errors over the samples (axis 0),
    # giving one cost value per column (i.e. per alpha)
    return np.sum((h_theta(X1, theta1) - y1) ** 2, axis=0) / (2 * len(y1))

def grad(X1, y1, theta):
    # Gradient for all learning rates at once (one column per alpha)
    h = h_theta(X1, theta)
    gradient = np.dot(X1.T, h - y1) / y1.shape[0]
    return gradient

def gradient_descent(X1, y1):
    learning_rates = [0.1, 0.01, 0.001]
    num_iterations = 1000
    num_learning_rates = len(learning_rates)
    learning_rates = np.array(learning_rates).reshape(1, -1)
    # One column of theta per learning rate, so all alphas are updated in one shot
    theta_initial = np.zeros((num_learning_rates, X1.shape[1])).T  # Initialize theta with zeros
    cost_iterations = np.zeros((num_learning_rates, num_iterations))
    theta = theta_initial.copy()
    start = time.time()
    for i in range(num_iterations):
        gradient = grad(X1, y1, theta)
        theta = theta - learning_rates * gradient  # broadcasts over the alpha columns
        cost = j_theta(X1, y1, theta)
        cost_iterations[:, i] = cost
    end = time.time()
    print(f"Time taken: {end - start} seconds")
    # Plotting disabled while timing:
    # fig, axs = plt.subplots(num_learning_rates, figsize=(8, 15))
    # for i, alpha in enumerate(learning_rates.ravel()):
    #     axs[i].plot(range(num_iterations), cost_iterations[i], label=f'alpha = {alpha}')
    #     axs[i].set_title(f'Learning Rate: {alpha}')
    #     axs[i].set_ylabel('Cost J')
    #     axs[i].set_xlabel('Number of Iterations')
    #     axs[i].legend()
    # plt.tight_layout()
    # plt.show()

# Stand-in data so the code runs end to end
X_normalized = np.random.randn(3072, 9)
# One target column per learning rate, so y broadcasts against theta's columns
y_normalized = np.random.randn(3072, 1).repeat(3, axis=1)
intercept_column = np.random.randn(3072, 1)

# Code to reduce X to 3 features (columns) using SVD:
# Perform singular value decomposition on X and keep the top 3 right singular vectors
U, S, Vt = np.linalg.svd(X_normalized)
# Project X onto the top 3 components
X_reduced = np.dot(X_normalized, Vt[:3].T)
# Print the first 5 rows of X_reduced
print("First 5 rows of X_reduced:")
print(X_reduced[:5])
# Normalize X_reduced
X_reduced = (X_reduced - np.mean(X_reduced, axis=0)) / np.std(X_reduced, axis=0)
print("The means and stds of X after being reduced and normalized:\n", X_reduced.mean(axis=0), X_reduced.std(axis=0))
# Print the shape of the reduced X to confirm it has only 3 features
print("Shape of X_reduced:", X_reduced.shape)
# Add the intercept column to both feature matrices
X_reduced_with_intercept = np.hstack((intercept_column, X_reduced))
X_normalized_with_intercept = np.hstack((intercept_column, X_normalized))

# Example usage
# X_normalized_with_intercept and y_normalized represent the original dataset
# X_reduced_with_intercept and y_normalized represent the reduced dataset
# Perform gradient descent on the original dataset
gradient_descent(X_normalized_with_intercept, y_normalized)
# Perform gradient descent on the reduced dataset
gradient_descent(X_reduced_with_intercept, y_normalized)
Now the code runs faster on the reduced dataset. I have filled in the blanks you left unspecified so that I could run your code. Hopefully I haven't broken the gradient descent code too badly, since I relied on the dimensions making sense rather than working the problem out with pen and paper.
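To make the overhead claim concrete, here is a minimal micro-benchmark sketch (my own addition; the sizes 3072x10 and 3072x4 are assumptions matching the stand-in data above, i.e. original features plus intercept versus 3 SVD components plus intercept). It times nothing but the matrix product from the gradient step at both widths:

import time
import numpy as np

m, iters = 3072, 1000                # assumed sizes, matching the example above
X_full = np.random.randn(m, 10)      # original features plus intercept column
X_small = np.random.randn(m, 4)      # 3 SVD components plus intercept column
th_full = np.random.randn(10, 3)     # one theta column per learning rate
th_small = np.random.randn(4, 3)

def time_matmul(X, theta):
    # Time only the dominant linear-algebra call from the gradient step
    start = time.time()
    for _ in range(iters):
        np.dot(X, theta)
    return time.time() - start

print(f"full ({X_full.shape[1]} cols):    {time_matmul(X_full, th_full):.4f} s")
print(f"reduced ({X_small.shape[1]} cols): {time_matmul(X_small, th_small):.4f} s")

If the two timings come out nearly identical, the fixed per-call cost of each np.dot (dispatch, argument checking, output allocation) is swamping the floating-point work at these sizes, and cutting the column count from 10 to 4 cannot make the overall loop measurably faster.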