# Encode the categorical columns as integer codes, in place on df.
# A single nested mapping of {column: {label: code}} handles all three
# columns in one replace call; values not listed are left untouched.
df.replace(
    {
        # sex: male -> 0, female -> 1
        'sex': {'male': 0, 'female': 1},
        # smoker: note that 'yes' maps to 0 and 'no' maps to 1
        'smoker': {'yes': 0, 'no': 1},
        # region: arbitrary integer code per region label
        'region': {
            'southeast': 0,
            'southwest': 1,
            'northeast': 2,
            'northwest': 3,
        },
    },
    inplace=True,
)
# Separate the predictors from the target: every column except 'charges'
# becomes a feature, and 'charges' is the value to predict.
# (columns= already selects the axis, so no axis argument is needed.)
X = df.drop(columns='charges')
Y = df['charges']

# Hold out 20% of the rows for testing. random_state is fixed so the same
# split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2
)
# Fit the linear regression model on the training split.
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# R^2 on the rows the model was trained on (computed but not printed).
predict = regressor.predict(X_train)
r2_train = metrics.r2_score(Y_train, predict)

# R^2 on the held-out rows; this is the only score that gets reported.
test_predict = regressor.predict(X_test)
r2_test = metrics.r2_score(Y_test, test_predict)
print("R Squared Value : ", r2_test)
# Local import so this step is self-contained; harmless if pandas is already
# imported under the same alias elsewhere in the script.
import pandas as pd

# One already-encoded record to score. The values must follow the column
# order of X -- presumably (age, sex, bmi, children, smoker, region);
# TODO confirm against X.columns.
input_data = (31, 1, 36.63, 2, 1, 0)
input_data_asnumpy = np.asarray(input_data)

# Shape the record as a single row with one column per feature.
input_data_reshaped = input_data_asnumpy.reshape(1, -1)

# The model was fitted on a DataFrame, so give the row the same column names;
# predicting on a bare array makes scikit-learn warn about missing feature
# names.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)
prediction = regressor.predict(input_frame)
print(prediction)
我使用了线性回归模型，并将数据拆分为 80% 的训练数据和 20% 的测试数据。之后我检查了 R²（R 平方）值，但模型给出的值高于实际值。如何提高这段代码或线性回归模型的准确性？