I am having trouble translating this R code into Python: statsmodels.GLM gives the correct results.
import numpy as np
import pandas as pd
from scipy.special import expit  # used instead of np.exp after an "overflow encountered in exp" warning
import statsmodels.api as sm
from matplotlib import pyplot as plt
# data
y = np.array([2, 3, 6, 7, 8, 9, 10, 12, 15], dtype=float)  # np.float was removed in NumPy 1.24; use float
x = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 0, 0, 0, 0, 1, 1, 1], dtype=float).reshape(2, 9).T  # with intercept in 1st col
data = pd.DataFrame({"y": y, "intercept": x[:, 0], "x": x[:, 1]})
print(data)
# for glm function
x1 = np.array([-1, -1, 0, 0, 0, 0, 1, 1, 1], dtype=float)
_x1 = sm.add_constant(x1, prepend=False)  # intercept column appended last
_y = y[:, np.newaxis]
########################
## GLM
#######################
# implement-poisson-regression
from statsmodels.genmod.cov_struct import (Exchangeable, Independence,Autoregressive)
from statsmodels.genmod.families import Poisson
ind = Independence()  # a GEE working-correlation structure; GLM does not actually use it
log_link = sm.families.links.Log()  # log link, the canonical (and default) link for Poisson regression
## endog = DV (y), exog = IV (design matrix)
poisson_model = sm.GLM(_y, _x1, cov_struct=ind, family=Poisson(link=log_link))
poisson_results = poisson_model.fit(atol=1e-8)  # IRLS; attach_wls=True would also attach the final WLS fit
print(poisson_results.summary())
##print(poisson_results.params)
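Since add_constant(..., prepend=False) appends the intercept column last, the fitted parameters come back in the order [slope, intercept]; for this data that is the pair of target values quoted further below:
print(poisson_results.params)  # ~[0.6698, 1.8893]: slope first, intercept last (prepend=False)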
###########
# https://www.statsmodels.org/devel/examples/notebooks/generated/glm.html
# Plot yhat vs y:
from statsmodels.graphics.api import abline_plot
yhat = poisson_results.mu
fig, ax = plt.subplots()
ax.scatter(yhat, _y)
line_fit = sm.OLS(_y, sm.add_constant(yhat, prepend=True)).fit()
abline_plot(model_results=line_fit, ax=ax)
ax.set_title('Model Fit Plot')
ax.set_ylabel('Observed values')
ax.set_xlabel('Fitted values');
plt.show()
sklearn gives different betas. Is the problem in log_scaling_preprocessing due to the nature of the lambda param? I still cannot reach the correct coef = 0.6698 & intercept = 1.8893:
#######################
## SKLEARN
#######################
from sklearn import linear_model
regr = linear_model.PoissonRegressor(tol=1e-8)  # note: alpha (L2 penalty strength) defaults to 1.0
regr.fit(data[["x"]], data.y)
print("\nPoissonRegressor")
print(regr.score(data[["x"]], data.y))
print("coef_ ", regr.coef_)
print("intercept_ ", regr.intercept_)
But my main problem is the hand-coded Newton-Raphson algorithm from the article linked above: it seems to diverge rather than converge, even though it is coded analogously to the linked R code. Where is my mistake in translating the logic to Python? Should the w matrix be computed in a different way?
def poisson_Newton_Raphson(x, y, b_init, tol=1e-8):
    change = np.inf
    b_old = b_init
    while change > tol:
        eta = x @ b_old              # linear predictor
        w = np.diag(expit(eta))      # weight matrix -- is this computed the wrong way?
        temp = x.T @ w @ x           # ?? np.linalg.inv(x.T @ w @ x)
        b_new = b_old + (temp @ x.T @ (y - expit(eta)))
        change = np.linalg.norm(b_old - b_new, 2)
        ## print(change)
        b_old = b_new
        print(b_old)
    return b_new

res = poisson_Newton_Raphson(x, y, np.zeros(2))
print(res)
How can I make all 3 pieces of code give the same (correct, as in the GLM example) results?

P.S. Alternatively: my w matrix seems to be wrong in any case - how do I correct it?
P.P.S. Poisson regression models counts or rates; the variance of a Poisson distribution equals its mean (even for non-constant λ), and as lambda gets large the Poisson distribution looks more and more like a normal distribution (explanation).
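A quick numerical illustration of that last point (my sketch, using scipy.stats): for large lambda the Poisson pmf nearly coincides with the density of N(lambda, sqrt(lambda)):

import numpy as np
from scipy import stats

lam = 100                                    # illustrative value; larger lambda -> closer to normal
k = np.arange(60, 141)
poisson_pmf = stats.poisson.pmf(k, lam)      # P(X = k) for X ~ Poisson(lam)
normal_pdf = stats.norm.pdf(k, loc=lam, scale=np.sqrt(lam))  # normal with mean = var = lam
print(np.abs(poisson_pmf - normal_pdf).max())  # small: the two curves almost overlap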
P.P.P.S. glm-course_001
All of the methods you list give exactly the same results. Here is a demonstration:
x = np.array([1., -1, 1, -1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]).reshape(-1, 2)  # rows of [intercept, x]
y = np.array([2.0, 3.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 15.0])

def poisson_Newton_Raphson(x, y, b_init, tol=1e-8):
    change = 1
    b = np.array(b_init) + 0.0       # float copy of the start values
    while change > tol:
        eta = x @ b                  # linear predictor
        w = np.diag(np.exp(eta))     # IRLS weights: W = diag(mu) with mu = exp(eta)
        direction = np.linalg.solve(x.T @ w @ x, x.T @ (y - np.exp(eta)))
        change = np.linalg.norm(direction)
        b += direction
    return b

poisson_Newton_Raphson(x, y, [1, 1])
array([1.889272 , 0.6697856])
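For reference, there are two fixes relative to the question's version: the Poisson mean under the log link is exp(eta), not the logistic expit(eta) (expit is the mean function for binomial/logit models), and the Newton step must apply the inverse of X'WX, done here via np.linalg.solve instead of an explicit inverse. Written out, Newton-Raphson / Fisher scoring for the Poisson log-likelihood is:

loglik(b) = sum_i [ y_i * x_i'b - exp(x_i'b) ] + const
score(b)  = X' (y - mu),        mu = exp(X b)
info(b)   = X' W X,             W  = diag(mu)   # variance = mean for Poisson
b_new     = b_old + (X' W X)^{-1} X' (y - mu)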
from sklearn.linear_model import PoissonRegressor
regr = PoissonRegressor(fit_intercept=False, tol=1e-8, alpha=0)  # alpha=0: no penalty; intercept already in x
regr.fit(x, y).coef_
array([1.88927199, 0.66978561])

from statsmodels.discrete.discrete_model import Poisson
Poisson(y, x).fit(disp=0).params
array([1.889272 , 0.6697856])
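All three agree with the target values from the question (intercept 1.8893, slope 0.6698). The takeaways: use exp, not expit, for the Poisson mean; solve (X'WX) d = X'(y - mu) rather than multiplying by X'WX itself; and in sklearn disable the default L2 penalty with alpha=0 (here the intercept column is passed explicitly, hence fit_intercept=False).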