我正在尝试在一些数据上训练(恒等链接泊松)GLM 模型。 我用 R 和 Python 构建了我认为等效的模型。 在 R 中,GLM 无法收敛。 在 Python 中,GLM 确实收敛。 谁能解释一下为什么会这样?
从观察开始(R代码)
b <- c(1607.72888, 1457.32996, 2230.17041, 2371.61865, 2759.7373,
2200.35571, 3700.06104, 75333.8906, 33347.0156, 24970.9219, 5840.49512,
2465.53516, 1882.85522, 1027.7406, 796.484009, 863.635925, 926.715149,
1315.60522, 7734.25879, 255625, 130990.281, 71047.9375, 23927.1992,
15792.1631, 13041.4033, 6688.22559, 2822.23682, 2115.25049, 1618.78552,
1517.86987, 482027.531, 213073.016, 98384.4844, 35729.2227, 21157.3594,
16215.7432, 14006.8438, 4750.69092, 3116.08838, 1554.2738, 587.872559,
0, 0, 64.625, 16.4270325, 37.5073662, 274.369263, 0)
和预测器
M <- c(-7.9675869936053e-06, -2.9465510727696e-06, -2.21335646368051e-06,
3.64491029094931e-05, 1.27785515404535e-05, 3.38522978187467e-05,
0.00062461548108636, 0.0464058660868474, 0.0196678752619104,
0.0153096807169331, 0.00349654677177638, 0.0015296157844073,
0.00101697130553886, 0.00053606322761061, 0.00032545716927196,
0.00015466321949303, 0.00037175222102242, 0.00032327446276574,
0.00455841559776405, 0.169167191854014, 0.0871342536136015, 0.0473945835827427,
0.0161442668339746, 0.010567700966009, 0.00948533330933113, 0.00421478996145066,
0.00184265032175036, 0.00149379880911122, 0.00103486520709283,
0.00093688509002056, 0.298346941396298, 0.133754666017589, 0.0633174899164175,
0.0228254672410591, 0.0134052216553624, 0.0100745170940446, 0.00871104028108377,
0.00284039230023698, 0.00189497902518704, 0.00080088467813307,
1.5570018119882e-05, 5.27906279865597e-05, 5.36030668242488e-05,
2.02817303022873e-05, 2.78819345726129e-05, 2.85829668669995e-05,
1.83105158084491e-05, 4.3102208421767e-06)
或者在Python中观察
b = array([[1.60772888e+03],
[1.45732996e+03],
[2.23017041e+03],
[2.37161865e+03],
[2.75973730e+03],
[2.20035571e+03],
[3.70006104e+03],
[7.53338906e+04],
[3.33470156e+04],
[2.49709219e+04],
[5.84049512e+03],
[2.46553516e+03],
[1.88285522e+03],
[1.02774060e+03],
[7.96484009e+02],
[8.63635925e+02],
[9.26715149e+02],
[1.31560522e+03],
[7.73425879e+03],
[2.55625000e+05],
[1.30990281e+05],
[7.10479375e+04],
[2.39271992e+04],
[1.57921631e+04],
[1.30414033e+04],
[6.68822559e+03],
[2.82223682e+03],
[2.11525049e+03],
[1.61878552e+03],
[1.51786987e+03],
[4.82027531e+05],
[2.13073016e+05],
[9.83844844e+04],
[3.57292227e+04],
[2.11573594e+04],
[1.62157432e+04],
[1.40068438e+04],
[4.75069092e+03],
[3.11608838e+03],
[1.55427380e+03],
[5.87872559e+02],
[0.00000000e+00],
[0.00000000e+00],
[6.46250000e+01],
[1.64270325e+01],
[3.75073662e+01],
[2.74369263e+02],
[0.00000000e+00]])
和预测器
M = array([[-7.96758699e-06],
[-2.94655107e-06],
[-2.21335646e-06],
[ 3.64491029e-05],
[ 1.27785515e-05],
[ 3.38522978e-05],
[ 6.24615481e-04],
[ 4.64058661e-02],
[ 1.96678753e-02],
[ 1.53096807e-02],
[ 3.49654677e-03],
[ 1.52961578e-03],
[ 1.01697131e-03],
[ 5.36063228e-04],
[ 3.25457169e-04],
[ 1.54663219e-04],
[ 3.71752221e-04],
[ 3.23274463e-04],
[ 4.55841560e-03],
[ 1.69167192e-01],
[ 8.71342536e-02],
[ 4.73945836e-02],
[ 1.61442668e-02],
[ 1.05677010e-02],
[ 9.48533331e-03],
[ 4.21478996e-03],
[ 1.84265032e-03],
[ 1.49379881e-03],
[ 1.03486521e-03],
[ 9.36885090e-04],
[ 2.98346941e-01],
[ 1.33754666e-01],
[ 6.33174899e-02],
[ 2.28254672e-02],
[ 1.34052217e-02],
[ 1.00745171e-02],
[ 8.71104028e-03],
[ 2.84039230e-03],
[ 1.89497903e-03],
[ 8.00884678e-04],
[ 1.55700181e-05],
[ 5.27906280e-05],
[ 5.36030668e-05],
[ 2.02817303e-05],
[ 2.78819346e-05],
[ 2.85829669e-05],
[ 1.83105158e-05],
[ 4.31022084e-06]])
我首先尝试使用 R 的统计库构建 glm:
glm(b ~ 0 + M,
family = poisson(identity)))
这无法收敛,并出现错误:
错误:未找到有效的系数集:请提供起始值 另外:警告消息: 以 log(y/mu) 为单位:产生 NaN
然后我构建了以下(我相信是等效的)Python 模型:
poiReg = sm.GLM(b, M, family =
sm.families.Poisson(link =
sm.families.links.identity())).fit()
这个模型收敛到1578382.6635341388
我遇到了与问题中描述的几乎相同的问题。客户想要使用 Poisson GLM 的身份链接,因为他们认为它有更容易的解释。
除了预测负数的潜力之外,正如 Tom Wenseleers 对 OP 的评论中提到的:
R 中对恒等链接泊松 GLM 的支持很差,因为该算法目前并未对拟合系数施加非负约束,而这是恒等链接泊松收敛所必需的。
...对我来说,更改为日志链接解决了问题。