以下是我到目前为止所做的工作。
#importing the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
# Load the Kaggle Iowa house-price training and test CSV files and print the
# (rows, columns) shape of each frame right after it is read.
filepath = r"C:\Users...Kaggle data\house prediction iowa\house_predtrain (3).csv"
filepath2 = r"C:\Users...Kaggle data\house prediction iowa\house_predtest (1).csv"
train = pd.read_csv(filepath)
print(train.shape)
test = pd.read_csv(filepath2)
print(test.shape)
# Replace every NaN with 0 in both the training and the test frame.
# The test frame must be filled from itself: assigning train.fillna(0) to
# `test` would overwrite the test data with the training data.
train, test = train.fillna(0), test.fillna(0)
# Count how many columns there are of each dtype (bare expression; it is only
# displayed when it is the last statement of a notebook cell).
train.dtypes.value_counts()
# One-hot encode the categorical (object-dtype) columns of both frames.
# The builtin `object` is used for the dtype comparison because the
# `np.object` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
encode_cols = train.dtypes[train.dtypes == object].index.tolist()
encode_cols2 = test.dtypes[test.dtypes == object].index.tolist()
print(encode_cols2)
# Each frame is encoded with its own column list, so the two results may end
# up with different dummy columns and need to be aligned before modelling.
train_dummies = pd.get_dummies(train, columns=encode_cols)
test_dummies = pd.get_dummies(test, columns=encode_cols2)
# Align the encoded test frame to the training frame's columns (left join on
# axis 1) so both frames expose the same columns in the same order.
# Columns present only in the training frame (dummy levels missing from the
# test data, and presumably SalePrice) are created in the test frame;
# fill_value=0 stops them from being filled with NaN, which scikit-learn's
# linear models reject at predict time.
train, test = train_dummies.align(test_dummies, join='left', axis=1, fill_value=0)
print(train.shape)
print(test.shape)
# Pick out the float-typed columns of the training frame and rank them by
# skewness, most positively skewed first.
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
numericals_floats = train.dtypes == float
numericals = train.columns[numericals_floats]
print(numericals)
# NOTE(review): skew_limit is defined here but never applied in the visible
# code, so every float column ends up in skew_cols regardless of its skewness.
skew_limit = 0.35
skew_vals = train[numericals].skew()
skew_cols = (
    skew_vals
    .sort_values(ascending=False)
    .to_frame()
    .rename(columns={0: 'Skewness'})
)
# Bare expression: displayed only when it ends a notebook cell.
skew_cols
#Visualising them above data before and after log transforming
%matplotlib inline
field = 'GarageYrBlt'
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10,5))
train[field].hist(ax=ax_before)
train[field].apply(np.log1p).hist(ax=ax_after)
ax_before.set (title = 'Before np.log1p', ylabel = 'frequency', xlabel = 'Value')
ax_after.set (title = 'After np.log1p', ylabel = 'frequency', xlabel = 'Value')
fig.suptitle('Field: "{}"'.format (field));
#note how applying log transformation on GarageYrBuilt does not do much
# Apply np.log1p to every column listed in skew_cols, in both frames.
skewed_names = skew_cols.index.tolist()
print(skewed_names)  # the column names as a plain list
for column in skewed_names:
    # The target to be predicted is left untransformed.
    if column != "SalePrice":
        train[column] = train[column].apply(np.log1p)
        test[column] = test[column].apply(np.log1p)
# Split both frames into model inputs (every column except SalePrice) and the
# SalePrice target.
feature_cols = train.columns[train.columns != 'SalePrice'].tolist()
X_train, y_train = train[feature_cols], train['SalePrice']
X_test = test[feature_cols]
# NOTE(review): y_test is read from the *training* frame, so it is a copy of
# y_train rather than labels for the test rows — confirm before using it for
# any evaluation.
y_test = train['SalePrice']
print(X_test.shape)
print(y_train.shape)
print(X_train.shape)
# Feature engineering is finished; the models fitted next are plain linear
# regression, L1 (Lasso), L2 (Ridge) and ElasticNet (a blend of L1 and L2),
# starting with linear regression.
# Hand-picked candidate regularisation strengths for the cross-validated models.
alphas = [
    0.00005, 0.0005, 0.005, 0.05, 0.5, 0.1, 0.3,
    1, 3, 5, 10, 25, 50, 100,
]
# Nine evenly spaced values from 0.1 to 0.9, used as ElasticNet l1_ratio candidates.
l1_ratios = np.linspace(start=0.1, stop=0.9, num=9)
# Ordinary least squares: fit on the training data, predict the test rows,
# then print the score computed on the training data itself.
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
prediction1 = linearRegression.predict(X_test)
LR_score = linearRegression.score(X_train, y_train)
print(LR_score)
# Ridge regression with the regularisation strength chosen by
# cross-validation over `alphas`; scored on the training data.
ridgeCV = RidgeCV(alphas=alphas)
ridgeCV.fit(X_train, y_train)
prediction2 = ridgeCV.predict(X_test)
R_score = ridgeCV.score(X_train, y_train)
print(R_score)
# Lasso regression with the regularisation strength chosen by
# cross-validation over `alphas`; scored on the training data.
# max_iter is passed as the int 100: the original float 1e2 is rejected by
# scikit-learn's parameter validation, which requires an integer here.
lassoCV = LassoCV(alphas=alphas, max_iter=100).fit(X_train, y_train)
prediction3 = lassoCV.predict(X_test)
L_score = lassoCV.score(X_train, y_train)
print(L_score)
# ElasticNet with both the regularisation strength (`alphas`) and the L1/L2
# mix (`l1_ratios`) chosen by cross-validation; scored on the training data.
# max_iter is passed as the int 100: the original float 1e2 is rejected by
# scikit-learn's parameter validation, which requires an integer here.
elasticnetCV = ElasticNetCV(
    alphas=alphas,
    l1_ratio=l1_ratios,
    max_iter=100,
).fit(X_train, y_train)
prediction4 = elasticnetCV.predict(X_test)
EN_score = elasticnetCV.score(X_train, y_train)
print(EN_score)
# Random forest with default settings: fit on the training data, predict the
# test rows, and print the prediction shape and the training-data score.
from sklearn.ensemble import RandomForestRegressor
randfr = RandomForestRegressor().fit(X_train, y_train)
prediction5 = randfr.predict(X_test)
print(prediction5.shape)
RF_score = randfr.score(X_train, y_train)
print(RF_score)
# Gather the training-data scores of the five models into a one-column table.
# NOTE: despite the rmse_* names, these are the values returned by each
# estimator's .score() call, not RMSE values.
rmse_vals = [LR_score, R_score, L_score, EN_score, RF_score]
labels = ['Linear', 'Ridge', 'Lasso', 'ElasticNet', 'RandomForest']
# Naming the column in to_frame() replaces the former rename(..., inplace=1),
# which passed an int where pandas expects a bool.
rmse_df = pd.Series(rmse_vals, index=labels).to_frame(name='SCORES')
# Bare expression: displayed only when it ends a notebook cell.
rmse_df
# Build the submission table: one row per test Id paired with the random-forest
# prediction. (The stray leading backslashes, which made the statement a
# syntax error, and the no-op self-assignment have been removed.)
KaggleHouse_submission_1 = pd.DataFrame({'Id': test.Id, 'SalePrice': prediction5})
print(KaggleHouse_submission_1.shape)
在 Kaggle 房价预测竞赛中，有一个训练（train）数据集和一个测试（test）数据集。这里是实际数据的链接。输出数据帧的大小应为 1459 × 2，但由于某种原因，我的输出数据帧大小为 1460 × 2。我不确定为什么会这样。非常感谢任何反馈。
在以下行中:
test = train.fillna(0)
你用训练（train）数据覆盖了 test 变量……
scikit-learn 对列的顺序非常敏感，因此如果训练数据集和测试数据集的列没有对齐，就可能遇到与上述类似的问题。所以，您需要先使用下面的对齐命令，确保测试数据的编码方式与训练数据一致。
train, test = train_dummies.align(test_dummies, join='left', axis = 1)
请查看上面代码中的改动。