我正在阅读《掌握Python数据分析》,我发现教科书中的一些代码似乎无法正确编译,我对Python不是最熟悉,所以我无法弄清楚发生了什么错误的。教科书显示了旨在用于绘制累积分布函数的代码,但是当我调用此函数时,它似乎会抛出错误。如果有人对可能出现的问题有一些见解,我将不胜感激学习这一点的帮助。
我对这段代码的导入:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
这是教科书中所示的函数,用于绘制累积分布(注意,我添加了打印语句来尝试找出它的突破点):
# define a function that can plot cumulative distribution function (cdf)
def plot_cdf(data, plot_range=None, scale_to=None, **kwargs):
print("inputs received: data: {}, plot range: {}, scale to: {}".format(data, plot_range, scale_to))
num_bins = len(data)
sorted_data = np.array(sorted(data), dtype=np.float64)
data_range = sorted_data[-1] - sorted_data[0]
counts, bin_edges = np.histogram(sorted_data, bins=num_bins)
print("bin edges: {}".format(bin_edges))
xvalues = bin_edges[:1]
print("counts: {}, np.cumsum: {}".format(counts, np.cumsum(counts)))
yvalues = np.cumsum(counts)
if plot_range is None:
xmin = sorted_data[0]
xmax = sorted_data[-1]
else:
xmin, xmax = plot_range
#pad the arrays
xvalues = np.concatenate([[xmin, xvalues[0]], xvalues, [xmax]])
yvalues = np.concatenate([[0.0, 0.0], yvalues, [yvalues.max()]])
print("x values: {}".format(xvalues))
print("y values: {}".format(yvalues))
if scale_to is not None:
yvalues = yvalues / len(data) * scale_to
plt.axis([xmin, xmax, 0, yvalues.max()])
return plt.plot(xvalues, yvalues, **kwargs)
这是课本上展示的导致错误的方法:
xmin = 0
xmax = 3.5
xx = np.linspace(xmin,xmax,200)
plt.plot(xx, rvweib.cdf(xx), color='orange', lw=5)
plot_cdf(weib_variates, plot_range=[xmin, xmax], scale_to=1, lw=2, color='green')
plt.axis([xmin, xmax, 0, 1])
plt.title('Weibul distribution simulation', fontsize=14)
plt.xlabel('Failure Time', fontsize=12);
这是我收到的错误消息:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[19], line 5
3 xx = np.linspace(xmin,xmax,200)
4 plt.plot(xx, rvweib.cdf(xx), color='orange', lw=5)
----> 5 plot_cdf(weib_variates, plot_range=[xmin, xmax], scale_to=1, lw=2, color='green')
6 plt.axis([xmin, xmax, 0, 1])
7 plt.title('Weibul distribution simulation', fontsize=14)
Cell In[18], line 25, in plot_cdf(data, plot_range, scale_to, **kwargs)
23 yvalues = yvalues / len(data) * scale_to
24 plt.axis([xmin, xmax, 0, yvalues.max()])
---> 25 return plt.plot(xvalues, yvalues, **kwargs)
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\matplotlib\pyplot.py:3794, in plot(scalex, scaley, data, *args, **kwargs)
3786 @_copy_docstring_and_deprecators(Axes.plot)
3787 def plot(
3788 *args: float | ArrayLike | str,
(...)
3792 **kwargs,
3793 ) -> list[Line2D]:
-> 3794 return gca().plot(
3795 *args,
3796 scalex=scalex,
3797 scaley=scaley,
3798 **({"data": data} if data is not None else {}),
3799 **kwargs,
3800 )
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\matplotlib\axes\_axes.py:1779, in Axes.plot(self, scalex, scaley, data, *args, **kwargs)
1536 """
1537 Plot y versus x as lines and/or markers.
1538
(...)
1776 (``'green'``) or hex strings (``'#008000'``).
1777 """
1778 kwargs = cbook.normalize_kwargs(kwargs, mlines.Line2D)
-> 1779 lines = [*self._get_lines(self, *args, data=data, **kwargs)]
1780 for line in lines:
1781 self.add_line(line)
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\matplotlib\axes\_base.py:296, in _process_plot_var_args.__call__(self, axes, data, *args, **kwargs)
294 this += args[0],
295 args = args[1:]
--> 296 yield from self._plot_args(
297 axes, this, kwargs, ambiguous_fmt_datakey=ambiguous_fmt_datakey)
File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\matplotlib\axes\_base.py:486, in _process_plot_var_args._plot_args(self, axes, tup, kwargs, return_kwargs, ambiguous_fmt_datakey)
483 axes.yaxis.update_units(y)
485 if x.shape[0] != y.shape[0]:
--> 486 raise ValueError(f"x and y must have same first dimension, but "
487 f"have shapes {x.shape} and {y.shape}")
488 if x.ndim > 2 or y.ndim > 2:
489 raise ValueError(f"x and y can be no greater than 2D, but have "
490 f"shapes {x.shape} and {y.shape}")
ValueError: x and y must have same first dimension, but have shapes (4,) and (503,)
您的问题的答案在于回溯。尺寸不匹配。
检查
bin_edges[:1]
,可能您的意思是bin_edges[1:]
。
我想这是一个证明你理解的练习,日常使用即可
plt.hist(data, bins=num_bins, density=True, cumulative=True, histtype='step')