是否可以使用 scipy.stats.linregress 复现 statsmodels OLS 的残差均方误差(mse_resid)?我的数据集以及两种回归的代码如下。对于 OLS,我可以手动复现 mse_resid 属性的结果;我希望对 linregress 做同样的计算,使两种方法得到的 MSE 一致。
from scipy import stats
import statsmodels.api as sm
import numpy as np
# Predictor values for the regression (42 observations).
x = [4.36944785, 5.11198779, 5.27299956, 5.79909265, 5.26269019, 6.31896811,
5.56452041, 6.82762923, 6.85012617, 6.84481548, 5.58724866, 6.42162227,
6.42162227, 6.69456206, 6.72862861, 6.18620862, 6.68835471, 4.8598124,
6.00881319, 6.36130248, 6.49072353, 5.92958914, 6.25958146, 5.84643878,
6.3952616, 5.65248918, 6.23832463, 6.12905021, 5.29831737, 6.56667243,
5.0937502, 6.26909628, 6.54103, 6.53958596, 6.54103, 6.53958596,
6.72022016, 6.60258789, 6.70563909, 6.31173481, 6.58755001, 6.78105763]
# Response values, one per entry of x (42 observations).
y = [0.07982600432816954, 0.07071547302656801, 0.08305002908189892, 0.09691500731442375,
0.10527951322283996, 0.08553662721188253, 0.09733660673707481, 0.1549839973642277,
0.1772936800000746, 0.08246283701630393, 0.08674307140060505, 0.09188076686886233,
0.08911357473202978, 0.09129630573740526, 0.08960525194538806, 0.08171381972214242,
0.0916409307055752, 0.08059450458219804, 0.08339172699017788, 0.08679014230487617,
0.094905522499886, 0.08928735113983156, 0.08660721314000525, 0.08667870850057602,
0.09188469815760014, 0.08602803836358593, 0.0884816080378508, 0.08737399753878625,
0.08867415695668976, 0.09137142512590624, 0.0928207195650323, 0.0883305622258501,
0.08427867843903124, 0.09098192867485894, 0.08438044851325044, 0.08439333735802305,
0.09041076497326207, 0.0918760449066645, 0.09080763620015583, 0.08698756784494903,
0.09255954314642892, 0.08921757572875405]
# Fit ordinary least squares of y on x. The constant column is appended
# after x (prepend=False) so the model carries an intercept term.
design = sm.add_constant(x, prepend=False)
mod = sm.OLS(y, design)
res = mod.fit()

# Residual standard error by hand: sqrt(sum of squared residuals / residual dof).
rss = np.square(res.resid).sum()
err = np.sqrt(rss / res.df_resid)
print(err)

# The same quantity taken from the fitted result's mse_resid attribute.
print(res.mse_resid ** 0.5)
print()
# Fit the same simple linear regression with scipy and recompute the
# residual standard error by hand from the fitted slope and intercept.
res = stats.linregress(x, y)
k = res.slope
b = res.intercept
# Fitted values from the estimated line.
y_pred = [k * x_item + b for x_item in x]
n = len(x)
# NOTE(review): as parenthesized, only y_pred[i] is squared, so each summed
# term is y[i] - y_pred[i]**2 rather than the squared residual
# (y[i] - y_pred[i])**2. That is why the printed value does not match the
# OLS figure above. The n-2 divisor matches the two estimated parameters
# (slope and intercept).
error = (sum((y[i]-y_pred[i]**2 for i in range(n)))/(n-2))**0.5
print(error)
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the linregress predictions via scikit-learn.
# NOTE(review): this metric averages over all n samples rather than dividing
# by n-2, so it is not expected to equal mse_resid**0.5 exactly -- confirm
# against the scikit-learn docs.
# NOTE(review): the squared= keyword appears to be deprecated in recent
# scikit-learn releases (root_mean_squared_error is the replacement) --
# verify against the installed version.
rmse = mean_squared_error(y, y_pred, squared=False)
print(rmse)
为了完整性,将问题评论转换为答案:
在计算误差时括号未正确分组:原表达式只对 y_pred[i] 求了平方(即 y[i]-y_pred[i]**2),而不是对残差 (y[i]-y_pred[i]) 求平方。应该是:
error = (sum((y[i]-y_pred[i])**2 for i in range(n))/(n-2))**0.5