我有一组点,它们的分散图像类似于高斯正态分布。在互联网上搜索有很多如何将曲线拟合到点的Python示例。他们基于:
def Gauss1(X, C, mu, sigma):
return C * np.exp(-(X-mu) ** 2 / (2 * sigma ** 2))
和
from scipy.optimize import curve_fit
这会产生非常适合的曲线。问题是Gauss1不是高斯正态分布,它应该是:
def Gauss2(X, mu, sigma):
return (1 / (sigma * np.sqrt(2 * np.pi))) * np.exp(-(X-mu) ** 2 / (2 * sigma ** 2))
使用 Gauss2 生成下一条消息,并且没有 popt:
OptimizeWarning: Covariance of the parameters could not be estimated
有什么方法可以使用真实高斯分布生成拟合曲线吗?
popt2, pcov2 = curve_fit(Gauss2, x, y)
OptimizeWarning: Covariance of the parameters could not be estimated
我认为积分不需要,但我可以发布25行。
import matplotlib.pyplot as plt # Follow the convention
import numpy as np # Library for working with arrays
from scipy.optimize import curve_fit
######################################## G L O B A L S
# X the ratings after grouping by 23 the chess players from FIDE 2024 Oct Standard
# Y is the average of players in the averaged ratings
XY = (
(1411, 231), (1434, 271), (1457, 281), (1480, 287),
(1503, 292), (1526, 298), (1549, 293), (1572, 299),
(1595, 300), (1618, 303), (1641, 304), (1664, 308),
(1687, 301), (1710, 311), (1733, 304), (1756, 291),
(1779, 286), (1802, 283), (1825, 279), (1848, 268),
(1871, 260), (1894, 243), (1917, 229), (1940, 221),
(1963, 203), (1986, 169), (2009, 134), (2032, 112),
(2055, 102), (2078, 94), (2101, 81), (2124, 76),
(2147, 70), (2170, 61), (2193, 54), (2216, 45),
(2239, 42), (2262, 35), (2285, 31), (2308, 27),
(2331, 26), (2354, 22), (2377, 20), (2400, 18),
(2423, 15), (2446, 10), (2469, 10), (2492, 7),
(2515, 6), (2538, 5), (2561, 4), (2584, 3),
(2607, 3), (2630, 2), (2653, 2), (2676, 2),
(2699, 1), (2722, 2), (2745, 1), (2768, 1),
(2791, 1), (2837, 1))
class c: # Constants
X_LABEL_STEP = 50
Y_LABEL_STEP = 10
A4 = (11.69, 8.27) # W x H in inches
###################################### F U N C T I O N S
def Graph_Gauss():
global c, XY
X = [] ; Y = []
for e in XY:
X += [e[0]]
Y += [e[1]]
# Set up the limits for X, ratings
minX = min(X) ; maxX = max(X)
# Set up the limits for Y, number of players with that rating
minY = min(Y) ; maxY = max(Y)
fig, ax = plt.subplots()
fig.set_size_inches(c.A4)
fig.suptitle("Rating Distribution", fontsize=18, y=0.935)
x1 = [] # Make the x axle
xlb = [] # Make the labels for x
stp = c.X_LABEL_STEP
for k in range(stp*(minX//stp), stp*(maxX//stp + 2), stp):
x1 += [k]
xlb.append(f"{k}")
ax.set_xticks(ticks=x1, labels=xlb, rotation=270)
ax.set_xlabel("Rating", fontsize=15, labelpad=7)
stp = c.Y_LABEL_STEP
yticks = np.arange(stp*(minY//stp), stp*(maxY//stp + 2), stp)
ax.set_ylabel("Number of Players", fontsize=15, rotation=270, labelpad=18)
ax.set_yticks(ticks=yticks)
ax.grid(which="major", axis="both")
ax.scatter(X, Y, color="#000FFF", marker='o', s=14)
# plt.show()
# Fit a normal distribution, aka Gaussian fitting curve
# https://www.wasyresearch.com/tutorial-python-for-fitting-gaussian-distribution-on-data/
# mu = mean = sum(x) / len(x) ; In our case sum(x * y) / sum(y)
# sigma = standard deviation = sqrt((sum((x - mean)**2) / len(x))
#
# https://www.scribbr.com/statistics/normal-distribution/
# 1/(sigma*sqrt(2*pi)) * e**(-(x - mean)**2 / (2 * sigma**2))
# Calculating the Gaussian PDF values given Gaussian parameters and random variable X
def Gauss1(X, C, mu, sigma):
return C * np.exp(-(X-mu)**2 / (2 * sigma**2))
def Gauss2(X, mu, sigma):
return (1/(sigma*np.sqrt(2*np.pi))) * np.exp(-(X-mu)**2 / (2 * sigma**2))
x = np.array(X)
y = np.array(Y)
mu = sum(x * y) / sum(y)
sigma = np.sqrt(sum(y*(x - mu)**2)/sum(y))
print(f"{1/(sigma*np.sqrt(2*np.pi))=:.2f} {mu=:.2f} {sigma=:.2f}")
popt1, pcov1 = curve_fit(Gauss1, x, y, p0=[max(y), mu, sigma], maxfev=5000)
popt2, pcov2 = curve_fit(Gauss2, x, y) # This generates:
# OptimizeWarning: Covariance of the parameters could not be estimated
yg1 = Gauss1(x, *popt1)
yg2 = Gauss2(x, *popt2)
ax.plot(x, yg1, color="#FF0F00", linewidth=3,
label=f"Normal: mu={popt1[1]:.2f}, sigma={popt1[2]:.2f}")
plt.legend(fontsize=12)
breakpoint() # ???? DEBUG
plt.show()
pass # To set a breakpoint
#######################################################################
if __name__ == '__main__':
# breakpoint() # ???? DEBUG, to set other breakpoints
Graph_Gauss()
你有两个重大错误。
您提供的数据是分档频率,而不是概率密度。然而,您尝试拟合的函数 Gauss2 是一个概率密度函数(其积分为 1)。
您还没有提供所有数据,因此从分箱频率转换为概率密度非常困难,因为您不知道总人数! (我在下面估算过。)
分箱频率和 pdf 之间的联系是
binned frequency = number x pdf x dx
其中
number
是总人数,dx
是垃圾箱宽度。因此,您可以将 Gauss2 函数拟合到 Y/(number*dx)
而不是 Y
本身。当您绘制它时,您必须反转它才能获得频率。
我不得不估计
number=10000
,因为你没有提供所有数据;您可以在下面的代码中更正此问题。
改变你的台词
popt1, pcov1 = curve_fit(Gauss1, x, y, p0=[max(y), mu, sigma], maxfev=5000)
popt2, pcov2 = curve_fit(Gauss2, x, y)
yg1 = Gauss1(x, *popt1)
yg2 = Gauss2(x, *popt2)
ax.plot(x, yg1, color="#FF0F00", linewidth=3,
label=f"Normal: mu={popt1[1]:.2f}, sigma={popt1[2]:.2f}")
以下内容(从整个数据集中更正
number
):
popt1, pcov1 = curve_fit(Gauss1, x, y, p0=[max(y), mu, sigma], maxfev=5000)
yg1 = Gauss1(x, *popt1)
number = 10000 # this has been estimated: you didn't provide all of the data
dx = 23 # bin width
popt2, pcov2 = curve_fit(Gauss2, x, y/(number*dx), p0=[mu, sigma] )
yg2 = Gauss2(x, *popt2) * number * dx
ax.plot(x, yg1, color="#FF0000", linewidth=3,
label=f"Normal: mu={popt1[1]:.2f}, sigma={popt1[2]:.2f}")
ax.plot(x, yg2, color="#00FF00", linewidth=3,
label=f"Normal: mu={popt2[0]:.2f}, sigma={popt2[1]:.2f}")