I have this code, but it has to iterate over more than 2000 columns and 3000 rows. Can someone help me avoid the for loop? The data is the classic data from the yahoo finance package, so it has adj close, close, high, low and volume; to keep the models comparable, I downloaded the data once and saved it to the file "data full 10".
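For context, the file was produced roughly like this (the ticker list and date range below are only placeholders for illustration):

import pandas as pd
from pandas_datareader import data as pdr

# Hypothetical example of how the CSV was built; tickers and dates are placeholders
tickers = ["AAPL", "MSFT", "GOOG"]
raw = pdr.get_data_yahoo(tickers, start="2013-01-01", end="2023-01-01")
# Columns come back as a two-level index of (field, ticker), e.g. ('Adj Close', 'AAPL')
raw.to_csv("data 10 full.csv")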
Here is my code. Can someone help me write a function that avoids the loop? Thanks for your suggestions.
import pandas as pd
from pandas_datareader import data as pdr
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from datetime import datetime, timedelta
# Load data
data = pd.read_csv("data 10 full.csv", index_col=[0], header=[0,1])
# Get number of stocks
num_stock = len(data['Adj Close'].columns)
# Define window size
window_size = 60
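# Each iteration below takes a window of the most recent rows, trains a fresh model
# on it, and predicts from the last row of the window (a walk-forward / rolling-window
# scheme), which is where the loop comes from.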
# Define grid for hyperparameter tuning
grid_rf = {
    'n_estimators': [500],
    'max_depth': [5, 10, 15, 20, 25, 30],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30],
    'min_samples_leaf': [1, 5, 10, 15, 20, 25, 30]
}
# Initialize predictions array
predictions = []
# Define a function to perform the random search and return the trained model
def train_model(x_train, y_train):
    # Drop rows that are not present in both datasets
    idx = x_train.index.intersection(y_train.index)
    x_train = x_train.loc[idx]
    y_train = y_train.loc[idx]
    # Fit randomized search cross validation
    model = RandomForestRegressor()
    rscv = RandomizedSearchCV(estimator=model, param_distributions=grid_rf, cv=3, n_jobs=-1, verbose=2, n_iter=40)
    x_train = x_train.fillna(0)
    y_train = y_train.fillna(0)
    rscv_fit = rscv.fit(x_train, y_train)
    best_parameters = rscv_fit.best_params_
    # Train model
    model = RandomForestRegressor(n_estimators=best_parameters['n_estimators'],
                                  min_samples_split=best_parameters['min_samples_split'],
                                  min_samples_leaf=best_parameters['min_samples_leaf'],
                                  max_depth=best_parameters['max_depth'])
    model = model.fit(x_train, y_train)
    return model
# Define a function to make predictions on a single window
def predict_window(window, model):
    x_test = window.iloc[-1, :-num_stock].fillna(0)
    prediction = model.predict(x_test.values.reshape(1, -1))
    return prediction[0]
# Use rolling to iterate over windows
models = []
for i in range(window_size+1, len(data)):
    # Get current window
    window = data.iloc[i-window_size-1:i]
    # Define training and testing data
    x_train = window.iloc[:-1, :-num_stock].fillna(0)
    y_train = window.iloc[:-1, num_stock:].fillna(0).shift(-1).dropna()
    # Train model and store it
    model = train_model(x_train, y_train)
    models.append(model)
    # Make predictions and append to list
    prediction = predict_window(window, model)
    predictions.append(prediction)
# Convert predictions to dataframe and set index
predictions_df = pd.DataFrame(predictions, columns=data.columns.get_level_values(0))
predictions_df.set_index(data.index[window_size+1:], inplace=True)
# Calculate root mean squared error
rmse = np.sqrt(mean_squared_error(data.iloc[window_size+1:, :num_stock], predictions_df))
print('RMSE: ', rmse)
# Plot actual and predicted data
import matplotlib.pyplot as plt
plt.plot(data.index, data.iloc[:, :num_stock])
plt.plot(predictions_df.index, predictions_df)
plt.legend(['Actual', 'Predicted'])
plt.show()