为什么我的输出数据帧形状不是1459 x 2而是1460 x 2

Question

以下是我到目前为止所做的工作。

#importing the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor

# Locations of the Kaggle training and test CSV files on the local machine.
filepath = r"C:\Users...Kaggle data\house prediction iowa\house_predtrain (3).csv"
filepath2 = r"C:\Users...Kaggle data\house prediction iowa\house_predtest (1).csv"

# Load each file and replace every missing value (NaN) with 0 straight away.
# Each frame is filled from itself: filling `test` from `train` would silently
# overwrite the test data with the training rows.
train = pd.read_csv(filepath).fillna(0)
print(train.shape)

test = pd.read_csv(filepath2).fillna(0)
print(test.shape)

# Notebook display: how many columns of each dtype the training frame has.
train.dtypes.value_counts()

# Isolate the object-dtype (categorical) columns of each frame so they can be
# converted to numeric indicator columns.
# The builtin `object` is compared against because the `np.object` alias has
# been removed from NumPy.
encode_cols = train.dtypes[train.dtypes == object]
encode_cols2 = test.dtypes[test.dtypes == object]

# Keep only the column names.
encode_cols = encode_cols.index.tolist()
encode_cols2 = encode_cols2.index.tolist()
print(encode_cols2)

# One-hot encode the categorical columns of each frame independently.
train_dummies = pd.get_dummies(train, columns=encode_cols)
test_dummies = pd.get_dummies(test, columns=encode_cols2)

# Align the test columns to the training columns (same set, same order).
# join='left' keeps exactly the training columns; any column that exists only
# in the training frame is created in the test frame and filled with 0 rather
# than NaN, so the test features can be passed to estimators that reject NaN.
train, test = train_dummies.align(test_dummies, join='left', axis=1, fill_value=0)
print(train.shape)
print(test.shape)

# Select the float-typed feature columns of the training frame.
# The builtin `float` is compared against because the `np.float` alias has
# been removed from NumPy (it was only ever an alias of the builtin).
numericals_floats = train.dtypes == float
numericals = train.columns[numericals_floats]
print(numericals)

# Measure the skewness of every float column.
# NOTE(review): skew_limit is defined here but is not used to filter
# skew_cols anywhere in the visible script - confirm whether a threshold
# filter was intended.
skew_limit = 0.35
skew_vals = train[numericals].skew()

# One-column frame of skewness values, most skewed first.
skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0: 'Skewness'}))

# Notebook display of the skewness table.
skew_cols

#Visualising them above data before and after log transforming
%matplotlib inline

field = 'GarageYrBlt'
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10,5))
train[field].hist(ax=ax_before)
train[field].apply(np.log1p).hist(ax=ax_after)

ax_before.set (title = 'Before np.log1p', ylabel = 'frequency', xlabel = 'Value')
ax_after.set (title = 'After np.log1p', ylabel = 'frequency', xlabel = 'Value')

fig.suptitle('Field: "{}"'.format (field));
#note how applying log transformation on GarageYrBuilt does not do much 

# Names of the float columns listed in the skewness table.
float_features = skew_cols.index.tolist()
print(float_features)

# Apply log1p to each of those columns in both frames, training frame first.
# The prediction target is left untransformed.
for column in float_features:
    if column == "SalePrice":
        continue
    for frame in (train, test):
        frame[column] = frame[column].apply(np.log1p)

# Every column except the target is a model feature.
feature_cols = [name for name in train.columns if name != 'SalePrice']

# Training features / target, and the test features in the same column order.
X_train = train[feature_cols]
y_train = train['SalePrice']
X_test = test[feature_cols]

# NOTE(review): y_test is taken from `train`, not `test`, so its length matches
# the training rows rather than X_test. It is not used again in the visible
# script - confirm whether it is needed at all.
y_test = train['SalePrice']

print(X_test.shape)
print(y_train.shape)
print(X_train.shape)

# Feature engineering is done. Fit four linear models on the training data:
# ordinary least squares, ridge (L2), lasso (L1) and elastic net (a blend of
# L1 and L2). For each model, predict on X_test and print the model's .score
# on the training data itself (X_train, y_train).

# Hand-picked candidate regularisation strengths for the cross-validated models.
alphas = [0.00005, 0.0005, 0.005, 0.05, 0.5, 0.1, 0.3, 1, 3, 5, 10, 25, 50, 100]
# Candidate L1/L2 mixing ratios for the elastic net: 0.1, 0.2, ..., 0.9.
l1_ratios = np.linspace(0.1, 0.9, 9)

# Linear regression (no regularisation)
linearRegression = LinearRegression().fit(X_train, y_train)
prediction1 = linearRegression.predict(X_test)
LR_score = linearRegression.score(X_train, y_train)
print(LR_score)

# Ridge (L2)
ridgeCV = RidgeCV(alphas=alphas).fit(X_train, y_train)
prediction2 = ridgeCV.predict(X_test)
R_score = ridgeCV.score(X_train, y_train)
print(R_score)

# Lasso (L1)
# max_iter must be an integer: the float literal 1e2 is rejected by
# scikit-learn's parameter validation, so the same limit is written as 100.
lassoCV = LassoCV(alphas=alphas, max_iter=100).fit(X_train, y_train)
prediction3 = lassoCV.predict(X_test)
L_score = lassoCV.score(X_train, y_train)
print(L_score)

# Elastic net (L1 + L2); same integer max_iter as the lasso above.
elasticnetCV = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, max_iter=100).fit(X_train, y_train)
prediction4 = elasticnetCV.predict(X_test)
EN_score = elasticnetCV.score(X_train, y_train)
print(EN_score)

# Random forest regressor with default hyper-parameters, fitted on the
# training data and used to predict the test rows.
from sklearn.ensemble import RandomForestRegressor

randfr = RandomForestRegressor().fit(X_train, y_train)

prediction5 = randfr.predict(X_test)
# Score is computed on the training data itself, like the other models.
RF_score = randfr.score(X_train, y_train)

print(prediction5.shape)
print(RF_score)

# Put it all together: one table with the score of every fitted model.
# NOTE: despite the `rmse_` prefix, these values are the models' `.score`
# results on the training data (R^2 for scikit-learn regressors), not RMSE.
rmse_vals = [LR_score, R_score, L_score, EN_score, RF_score]

labels = ['Linear', 'Ridge', 'Lasso', 'ElasticNet', 'RandomForest']

rmse_df = pd.Series(rmse_vals, index=labels).to_frame()
# Rename by reassignment instead of `inplace=1`: pandas expects a real bool
# for `inplace`, and the integer 1 is not accepted as one.
rmse_df = rmse_df.rename(columns={0: 'SCORES'})

# Notebook display of the score table.
rmse_df

# Build the submission frame: the Id of every test row paired with the
# random-forest prediction for that row, then print its shape.
KaggleHouse_submission_1 = pd.DataFrame({'Id': test.Id, 'SalePrice': prediction5})
print(KaggleHouse_submission_1.shape)

在 Kaggle 房价预测（House Prices）竞赛中，有一个训练数据集（train）和一个测试数据集（test）。这里是实际数据的链接。输出数据帧的形状应为 1459 x 2，但由于某种原因，我得到的输出数据帧形状却是 1460 x 2。我不确定为什么会这样，非常感谢任何反馈。

Answer 1

在以下行中：

test = train.fillna(0)

你用 train（训练集）数据给 test 变量赋值，把它覆盖了，所以之后的“测试”数据实际上是训练数据，预测结果的行数也就和训练集一样（1460 行），而不是测试集的 1459 行。

Answer 2

Scikit-learn 对列的顺序非常敏感，因此如果您的训练数据集和测试数据集的列没有对齐，就可能遇到与上述类似的问题。所以，您需要先使用下面的 align 命令，确保测试数据编码后的列与训练数据一致。

train, test = train_dummies.align(test_dummies, join='left', axis = 1)

看看上面代码的变化

为什么我的输出数据帧形状不是1459 x 2而是1460 x 2

问题描述投票：-2回答：2

2个回答

最新问题

为什么我的输出数据帧形状不是1459 x 2而是1460 x 2

问题描述 投票：-2回答：2

2个回答

最新问题

问题描述投票：-2回答：2